]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
a number of new tests and fixes; all tests green on 3.3
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import datetime
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import email.utils
13 import xml.etree.ElementTree
14 import random
15 import math
16
17 from .utils import *
18
19
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor turns a URL into a list of dictionaries
    describing the video(s) that URL refers to: the real media URL, the
    title, the uploader and so on.  That list is handed to the
    FileDownloader, which may then download the video to the file
    system, print metadata, or perform other actions.

    Mandatory fields in each dictionary:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All of these fields should be Unicode strings.

    Subclasses must redefine _real_initialize() and _real_extract() and
    define a _VALID_URL regexp; they should normally also be added to the
    list of extractors.  _real_extract() must return a *list* of
    information dictionaries as described above.

    Extractors that are known to be broken should set the _WORKING
    attribute to False so users are warned and the tests skip them.
    """

    # initialize() flips this to True after the first (lazy) initialization.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this extractor can handle the given URL."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initialize the instance (authentication, etc.) exactly once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Set the downloader used by this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
102
103
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
        (
            (?:https?://)?                                       # http(s):// (optional)
            (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
               tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
            (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
            (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
            (?:                                                  # the various things that can precede the ID:
                (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                |(?:                                             # or the v= param in all its forms
                    (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                    (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                    (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                    v=
                )
            )?                                                   # optional -> youtube.com/xxxx is OK
        )?                                                       # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
        (?(1).+)?                                                # if we found the ID, everything can follow
        $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container/extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video',  # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" (used only for display in --list-formats output)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so the base-class
        # implementation (which matches without re.VERBOSE) cannot be used.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur:
                dur = '4'  # default caption duration when no dur attribute is given
            start = float(start)
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" % (start / (60 * 60), start / 60 % 60, start % 60, start % 1 * 1000)
            end = "%02i:%02i:%02i,%03i" % (end / (60 * 60), end / 60 % 60, end % 60, end % 1 * 1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption)  # double cycle, intentional
            srt += str(n + 1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print the itag/extension/resolution table for --list-formats."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the interface language and, if credentials are available,
        log in and confirm age.  All failures are reported as warnings
        (or errors) through the downloader; nothing is raised."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download the watch page and get_video_info data for *url* and
        return a list of info dictionaries (one per selected format)."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # The URL is JSON-escaped in the page; strip the backslashes
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try the various "el" values until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                              % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:  # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    # Date was not in this format; try the next expression.
                    # (Was a bare except, which also hid real programming errors.)
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    # dict views are not indexable on Python 3; the old
                    # srt_lang_list.keys()[0] raised TypeError there.
                    srt_lang = list(srt_lang_list)[0]
                if srt_lang not in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml)
            except Trouble as trouble:
                # Subtitles are best-effort; report and keep extracting
                self._downloader.trouble(str(trouble))

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = {}
            for ud in url_data:
                real_url = ud['url'][0]
                # Not every stream entry carries a separate signature; the
                # old code indexed ud['sig'] unconditionally and crashed
                # with KeyError when it was absent.
                if 'sig' in ud:
                    real_url += '&signature=' + ud['sig'][0]
                url_map[ud['itag'][0]] = real_url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])]  # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])]  # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats]  # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id': video_id,
                'url': video_real_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
            })
        return results
507
508
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter form so
        age-restricted videos become visible."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a metacafe video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate to the YouTube extractor via the downloader
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # Decode here: urlopen().read() returns bytes, and the str
            # regexes below would raise TypeError on Python 3 otherwise.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Newer pages ship the media data inside the flashvars blob
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # regex groups are already str; the old .decode('utf-8') calls
        # raised AttributeError on Python 3
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
634
635
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title, uploader and upload date for a
        Dailymotion video page."""
        # Pull the video id out of the URL
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the watch page with the family filter disabled
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # The media URLs live in the JS "flashvars" assignment
        self.report_extraction(video_id)
        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Pick the best quality key that is present
        max_quality = None
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(match.group('title'))

        # Uploader: try the regular owner span first, then the official one
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            # lookin for official user
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # Upload date comes as DD-MM-YYYY; emit it as YYYYMMDD
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
732
733
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a photobucket video."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode here: urlopen().read() returns bytes, and matching the
            # str regexes below against bytes raises TypeError on Python 3.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # regex groups are already str; the old .decode('utf-8') calls
        # raised AttributeError on Python 3
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
797
798
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # Marked as not working; excluded from the active extractor list until fixed.
    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Return a one-element list with the info dict for *url*.

        URLs that are not extractable '/watch/' pages are fetched once to
        discover the canonical ids, then this method recurses on the
        rewritten '/watch/' URL with new_video=False.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            # NOTE(review): urlopen().read() returns bytes on Python 3, so the
            # str-pattern re.search calls and the .decode('utf-8') calls below
            # assume Python 2 str — presumably part of why _WORKING is False.
            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            # Recurse on the canonical /watch/ URL built from the page ids.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # NOTE(review): group(1) is the (people|profile) alternation, so this
        # captures the literal word 'people' or 'profile'; the uploader name
        # appears to be group(2) — confirm before re-enabling this extractor.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (required by the playlist request below)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]
940
941
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs.  The dot after (?:www|player) is escaped
    # so only a literal subdomain separator matches (the unescaped `.` also
    # accepted e.g. 'wwwXvimeo.com').
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Return a one-element list with the info dict for *url*.

        Extraction slices the embedded config JSON out of the watch page,
        then picks the best stream (hd > sd > other) among the known codecs.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON.  The page embeds it between the literal
        # markers ' = {config:' and ',assets:'.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the markers were not found in the page;
            # ValueError: json.loads could not parse the sliced text.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit.)
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_id("description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date (left None when the page lacks the clip-date span)
        video_upload_date = None
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first available stream in preference order; the for-else
        # fires only when no codec matched at all.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1054
1055
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in 'index-<digits>.html'.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its body; report trouble and return None on failure."""
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* and collect groups into a dict.

        matchTuples is a list of (group_index, key, error_message) tuples;
        each matched group is stored under *key*.  Returns None (after
        reporting trouble) when the regex or any group fails to match.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): if fetch_webpage failed, page is None and re.search
        # raises TypeError here; trouble() has already been reported in that
        # path — confirm whether callers rely on that crash behavior.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve a live-stream page into an info dictionary.

        Bug fix: the computed video_url used to be discarded (the method fell
        off the end without a return statement), so live URLs never produced
        a result.  It now returns an info dict shaped like the one from
        extractPlus7Stream().
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
            '(http://.*?\.swf).*?' +
            '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        return {
            'id': info.get('path'),
            'url': video_url,
            'uploader': u'arte.tv',
            'upload_date': None,
            # The page exposes no title for the live stream; reuse the stream
            # path so the downloader has a usable name.
            'title': info.get('path'),
            'ext': u'flv',
            'format': u'NA',
            'player_url': info.get('player'),
        }

    def extractPlus7Stream(self, url):
        """Resolve an arte+7 catch-up page into an info dictionary."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
            '<name>(.*?)</name>.*?' +
            '<dateVideo>(.*?)</dateVideo>.*?' +
            '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or +7 extractor and return [info]."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Previously this branch returned None, dropping the extracted
            # stream; it now yields an info dict like the +7 branch.
            info = self.extractLiveStream(url)
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1191
1192
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # Matches anything, so this extractor must be tried after all others.
    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn loudly: reaching this extractor means no site-specific IE matched.
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # A Request that issues HEAD instead of GET so the redirect
            # target can be discovered without downloading the body.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying with GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        # NOTE(review): a bare OpenerDirector starts with NO default handlers,
        # so only the handlers added below are active (no proxy, cookie or
        # redirect support beyond HEADRedirectHandler).
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL after following: no redirect; caller continues extracting.
        if url == new_url:
            return False

        # Redirected: restart the whole extraction chain on the final URL.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Scrape a direct media URL out of an arbitrary webpage."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # NOTE(review): webpage is bytes on Python 3 while the patterns below
        # are str — confirm against the compat layer before relying on py3.
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
1337
1338
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'ytsearch[N|all]:terms' pseudo-URL and queue the results."""
        match = re.match(self._VALID_URL, query)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        # Empty prefix means "first result only"; 'all' means "as many as the
        # API caps at"; anything else must parse as a positive integer.
        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        else:
            try:
                count = int(prefix)
                if count <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (count, query))
                elif count > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, count))
                    self._download_n_results(query, self._max_youtube_results)
                else:
                    self._download_n_results(query, count)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        collected = []
        page_index = 0
        limit = n

        # The API serves 50 results per page; 'limit' shrinks to the real
        # total once the first response reports totalItems.
        while 50 * page_index < limit:
            self.report_download_page(query, page_index + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), 50 * page_index + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            collected.extend(video['id'] for video in api_response['items'])

            limit = min(n, api_response['totalItems'])
            page_index += 1

        # Queue at most n ids, in the order the API returned them.
        for video_id in collected[:n]:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1413
1414
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'gvsearch[N|all]:terms' pseudo-URL and queue the results."""
        match = re.match(self._VALID_URL, query)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        # Empty prefix means "first result only"; 'all' means "up to the
        # hard cap"; anything else must parse as a positive integer.
        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        else:
            try:
                count = int(prefix)
                if count <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (count, query))
                elif count > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, count))
                    self._download_n_results(query, self._max_google_results)
                else:
                    self._download_n_results(query, count)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)

    def _queue_downloads(self, video_ids):
        # Hand every collected docid to the downloader as a playable URL.
        for video_id in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers in first-seen order, stopping as soon
            # as the requested count is reached.
            for found in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = found.group(1)
                if video_id in video_ids:
                    continue
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    self._queue_downloads(video_ids)
                    return

            # No "next" link: queue whatever was found and stop paging.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._queue_downloads(video_ids)
                return

            pagenum = pagenum + 1
1495
1496
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # Marked as not working; excluded from the active extractor list until fixed.
    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # NOTE(review): query.decode() assumes a Python 2 byte string here —
        # on Python 3 the query is already str; confirm against compat layer.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'yvsearch[N|all]:terms' pseudo-URL and queue the results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        # Empty prefix: first result only; 'all': up to the hard cap;
        # otherwise the prefix must parse as a positive integer.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers; the set keeps list-append order while
            # giving O(1) duplicate checks across pages.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "Next" link: queue whatever was found and stop paging.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1581
1582
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Group 1: playlist-type discriminator (p/a/list); group 2: playlist id;
    # group 3: a trailing single-video id, when present.
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Queue every video of the playlist (honoring playliststart/playlistend)."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (deduplicated within each page only)
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # Stop when the page no longer offers a "Next »" link.
            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # playliststart/playlistend are 1-indexed and inclusive; the -1 turns
        # playliststart into a 0-based slice start, and the exclusive slice
        # end makes playlistend inclusive as documented.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1661
1662
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Queue every video found on the channel's paged video listing."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = match.group(1)
        video_ids = []
        pagenum = 1

        # Walk the listing page by page until the "Next »" link disappears.
        while True:
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect video identifiers, deduplicated within this page only.
            page_ids = []
            for found in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = found.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1713
1714
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Pull the username out of the URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps every response at _GDATA_PAGE_SIZE entries,
        # so request consecutive pages until one comes back short --
        # that page is necessarily the last one.
        video_ids = []
        page_index = 0
        while True:
            start_index = page_index * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            feed_request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
            try:
                feed_page = compat_urllib_request.urlopen(feed_request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's video ids, dropping duplicates
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, feed_page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # A page that is not "full" must be the final one, so no
            # further queries are necessary.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break
            page_index += 1

        all_ids_count = len(video_ids)

        # Apply the --playlist-start / --playlist-end window
        first_index = self._downloader.params.get('playliststart', 1) - 1
        last_index = self._downloader.params.get('playlistend', -1)
        if last_index == -1:
            video_ids = video_ids[first_index:]
        else:
            video_ids = video_ids[first_index:last_index]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1796
1797
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id lives in a data attribute of the profile page;
        # without it the AJAX episode-list endpoint cannot be queried.
        # Guard explicitly: previously a missing attribute raised an
        # uncaught AttributeError instead of a proper error message.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id from webpage')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str (not str) for consistency with the rest of the
                # file; str() can fail on unicode messages under Python 2
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply the --playlist-start / --playlist-end window
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1888
1889
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Decode at the boundary so the regular expressions below run
            # on text rather than bytes (required on Python 3; previously
            # the page was kept as bytes and later .decode() calls were
            # applied to already-text match results)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1)

        # All values are already text now that the page is decoded up front
        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
1952
1953
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # NOTE(review): extractor is disabled (_WORKING = False); much of the
    # code below still uses Python 2 idioms such as .decode() on values
    # that are already text on Python 3 -- confirm before re-enabling.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Known format identifiers, ordered best quality first
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data: each key maps to the regex that locates its value
        # in the page's inline JavaScript
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # values are percent-escaped inside a JS unicode-escape layer
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook if credentials were supplied."""
        # Without a downloader there are no params to read
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # No credentials -> proceed anonymously
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login <form> in the response means the credentials were rejected
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (optional; warn but continue without it)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    # parsedate_tz returns a 10-tuple; strftime takes the
                    # leading 9 fields.  The bare except keeps extraction
                    # best-effort if formatting fails.
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # --max-quality trims the candidate list from the best end
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # NOTE(review): if url_map is empty, video_url_list is never bound
        # and the loop below raises NameError -- confirm intended behavior
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            })
        return results
2159
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Ask blip.tv for the JSON description of the video
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                if isinstance(title, bytes):
                    # Python 2 yields a byte string here and needs decoding;
                    # Python 3 str has no .decode, so only decode when the
                    # value is actually bytes (previously an unconditional
                    # .decode raised AttributeError on Python 3)
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Responses are either wrapped in a 'Post' envelope or bare
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                # Derive the container extension from the media URL
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # blip.tv serves some downloads only to the iTunes user agent
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2249
2250
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # fixed: was self._download.trouble, which raised an
            # AttributeError instead of reporting the invalid URL
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail link encodes the base URL of the movie files;
        # the flv lives next to it under the video id
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
2308
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                     (?P<showname>thedailyshow|colbertnation)\.com/
                     (full-episodes/(?P<episode>.*)|
                      (?P<clip>
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""
    IE_NAME = u'comedycentral'

    # Known bitrates, worst first (turls below ends up in this order)
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report download of a media configuration document."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the show's episode index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the SWF player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)


    def _print_formats(self, formats):
        # Used by --list-formats: bitrate, extension and dimensions
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shortcut forms like ":tds" map to the show's full-episodes page
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No explicit episode -> the server will redirect us to the
            # newest one; detect that case and re-parse after download
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # We were redirected to the concrete newest episode; take the
            # episode title from the final URL
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Follow redirects to obtain the final SWF player URL
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        # The MRSS index lists one <item> per media segment
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Each <rendition> advertises one (bitrate, url) pair
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
2519
2520
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode using the charset advertised in the response header,
            # falling back to utf-8
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Each of these page elements is required; fail with a clear error
        # message instead of an AttributeError when one is missing.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract description')
            return
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract thumbnail')
            return
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config document location in its query
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract configuration URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2594
2595
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report the download of the Adobe f4m manifest."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except (IndexError, KeyError):
            # findall()[0] raises IndexError, attrib['url'] raises KeyError;
            # either way the manifest is malformed, so report rather than crash.
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2666
2667
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report that the video page is being downloaded."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that metadata extraction has started."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Re-fetch through the canonical URL built from the extracted id.
        request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            raw_page = compat_urllib_request.urlopen(request).read()
            webpage = raw_page.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The media URL is percent-encoded inside a flashvars-style parameter.
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title comes from the <title> tag, minus the site suffix.
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # For the thumbnail the whole matched URL is used (group 0),
        # not the trailing filename capture.
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2737
2738
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report the resolve.json API lookup."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report the stream-URL retrieval."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # The info-dict contract requires upload_date in YYYYMMDD form, but the
        # API reports created_at like "2012/05/02 19:35:57 +0000" — convert it
        # when the string matches that shape, otherwise leave the date unset.
        upload_date = None
        mobj = re.search(r'^(\d{4})/(\d{2})/(\d{2})', compat_str(info['created_at']))
        if mobj is not None:
            upload_date = ''.join(mobj.groups())

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
2811
2812
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report the webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # Local import: base64 is only needed by this extractor.
        import base64

        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            # Decode to text so the str regexes below also work on Python 3,
            # where read() returns bytes.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL; the page stores it base64-encoded.
        # (str.decode('base64') does not exist on Python 3 — use the base64
        # module, which works on both interpreters.)
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description; fall back to a placeholder when missing.
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2881
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # (match groups are already text — the Python-2-only .decode('utf-8')
        # calls would raise AttributeError on Python 3 and were removed)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode the raw bytes first; json.loads needs text)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2996
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on the URL shape: a single video (course+video params),
        a course page (course param only), or the site root.  The latter two
        build playlists by recursing through self.extract() on each link."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            # Derive the extension from the file name in the video URL.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            # NOTE(review): coursepage is raw bytes here while the patterns
            # below are str — this relies on Python 2 semantics; verify that
            # it behaves on Python 3 (str patterns cannot match bytes).
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect the VideoPage links in document order, dropping duplicates.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            # Recurse on every referenced video page and flatten the results.
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Every CoursePage reachable from the root becomes a playlist entry.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            # Recurse on every course page and flatten the results.
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3113
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report the webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            # Decode to text up front (the original decoded each match group as
            # iso-8859-1); on Python 3 read() returns bytes and the str regexes
            # below would not match otherwise.
            webpage = compat_urllib_request.urlopen(request).read().decode('iso-8859-1')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Message previously read 'unable to mtvn_uri' — a verb was missing.
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3203
3204
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves a video as numbered segments; the per-segment file ids are
    reconstructed from a server-supplied 'seed' via the character-shuffling
    scheme implemented in _get_file_ID_mix_string/_get_file_id below.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        # Session id: current time in milliseconds followed by two random
        # numbers, mirroring what the site's own player generates.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the alphabet with a linear congruential
        # generator seeded by the server-provided 'seed'; each step picks and
        # removes one character, so the result is a permutation of 'source'.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # fileId is a '*'-separated list of indices into the shuffled
        # alphabet; map each index back to its character.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            # Map the requested format onto Youku stream names: best quality is
            # 'hd2' when available, otherwise 'flv'; 'worst' maps to 'mp4';
            # anything else falls back to 'flv'.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3319
3320
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page in one step.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # Pull the three fields out of the page; each has its own pattern and
        # its own error message so failures are easy to pinpoint.
        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3383
3384
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the extracted entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the extracted uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the extracted title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # The return was missing here: falling through would crash with
            # AttributeError on mobj.group(1) below.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links of all sizes from the video page
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # The return was missing here: an empty list would crash with
            # IndexError on links[-1] below.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
3508
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Build the info dictionary for an nba.com video URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The CDN URL is derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First (HTML-unescaped) capture group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUGFIX: the key was misspelled 'uploader_date'; the downloader
            # expects the standard 'upload_date' field (see the contract in
            # the InfoExtractor class docs at the top of this file).
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3554
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page; return (total item count, info dicts)."""
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            # BUGFIX: was a bare `return` (None); the caller unpacks the
            # result into two values, so return an empty page instead of
            # crashing with a TypeError.
            return (0, [])

        response = json.loads(webpage)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Skip entries without a downloadable file.
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'created_on' starts with 'YYYY-MM-DD'; strip the dashes
                # to get the YYYYMMDD format the downloader expects.
                video_date = re.sub('-', '', clip['created_on'][:10])
                info.append({
                    'id': clip['id'],
                    'url': video_url,
                    'title': clip['title'],
                    'uploader': clip.get('user_id', clip.get('channel_id')),
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Extract either a single clip or a whole channel archive."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: the archive listing must be paged through.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/clip/show/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short (or unpaged) page means there is nothing more to fetch.
            if not paged or page_count != limit:
                break
            offset += limit
        return info