]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
Use extract_info in BlipTV User and Youtube Channel
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    An information extractor turns a URL into information about the
    video (or videos) that URL refers to: the real video URL, the
    title, the uploader, and so on. The result is a dictionary which is
    handed to the FileDownloader, which may then download the video to
    the file system, among other possible outcomes.

    Mandatory dictionary fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should redefine the _real_initialize() and
    _real_extract() methods, define a _VALID_URL regexp and, usually,
    be added to the list of extractors. _real_extract() must return a
    *list* of information dictionaries as described above.

    Broken IEs should set the _WORKING attribute to False so that users
    are warned and the tests are skipped.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc). Idempotent."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derived by stripping the trailing "IE" from the class name.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        return handle.read().decode('utf-8', 'replace')
131
132
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regexp: matches full watch/embed/short URLs as well as a naked
    # video ID. Group 1 is the (optional) URL prefix, group 2 the video ID.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension for the formats above.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> frame dimensions; note the order is "HEIGHTxWIDTH".
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL, so defer them to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SRT subtitle text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when none is given
            start = float(start)
            end = start + float(dur)
            # Format as SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Download closed captions for video_id.

        Returns a (warning_message, srt_contents) pair; exactly one of
        the two is None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map lang_code -> track name from the listing page.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: user-requested, then English, then first available.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'lang': srt_lang,
            'name': srt_lang_list[srt_lang].encode('utf-8'),
            'v': video_id,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the available itags with their extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the interface language, then (optionally) log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX/dsh tokens the Google login form requires.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
            u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form being present in the response means we were bounced
            # back to it, i.e. the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID for url, or report an error and return None."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # Group 2 of _VALID_URL is the video ID (group 1 is the URL prefix).
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JS-escaped URL (\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' values until one response carries a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # Best-effort: the page date may match any of the formats
                    # above; formats that fail to parse are simply skipped.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream entry also carries a 'sig'
            # parameter; an entry without one would raise KeyError -- TODO confirm
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # Build one result dict per (format, url) pair selected above.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
591
592
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page, then disable the filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a Metacafe watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Metacafe mirrors some YouTube videos; delegate those downloads.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            # BUGFIX: decode the page once. The original code applied str
            # regexes to the raw bytes and then called .decode('utf-8') on
            # the resulting values, which fails on Python 3 (str patterns
            # cannot match bytes, and str has no .decode()).
            webpage = webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # No &mediaURL= parameter; fall back to the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Already text after the decode above; no further .decode() needed.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
718
719
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        super(DailymotionIE, self).__init__(downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the highest-quality media URL, title, uploader and date."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Strip the slug/query suffix from the matched ID segment.
        video_id = m.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(req, video_id)

        self.report_extraction(video_id)
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Walk the quality keys from best to worst and keep the first present.
        max_quality = None
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Uploader: try the owner span first, then the official-user markup.
        video_uploader = None
        m = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if m is not None:
            video_uploader = m.group(1)
        else:
            m_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if m_official is not None:
                video_uploader = m_official.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # Upload date, converted from DD-MM-YYYY to YYYYMMDD.
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
807
808
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            # BUGFIX: decode the page once. The original code ran str regexes
            # over raw bytes and called .decode('utf-8') on str values, both
            # of which fail on Python 3.
            webpage = webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Already text after the decode above; no further .decode() needed.
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
872
873
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Yahoo! Video URL.

        Non-/watch/ URLs are first rewritten to the canonical /watch/ form
        and re-extracted once (new_video=False marks that recursion).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # BUGFIX: group(1) is the literal 'people'/'profile' path component;
        # the uploader's display name is captured by group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (the playlist endpoint needs them)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]
1015
1016
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Vimeo URL via the embedded config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize protocol-less and play_redirect_hls URLs to a page URL.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page.
        # FIX: the bare `except:` here also swallowed KeyboardInterrupt and
        # SystemExit; only the two failures these statements actually raise
        # (marker missing -> IndexError, malformed JSON -> ValueError) are
        # caught now.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date (YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality bucket; the for/else fires only
        # when no bucket had any entry.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1135
1136
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        # Download *url* and return its body; on failure reports through the
        # downloader and falls off the end (implicitly returning None).
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        # Fetch *url*, apply *regex* with *regexFlags*, and collect the capture
        # groups listed in *matchTuples* -- tuples (group_index, key, error_msg)
        # -- into a dict.  Returns None (after reporting the tuple's error)
        # when the page does not match or a required group is empty.
        # NOTE(review): if fetch_webpage failed, *page* is None here and
        # re.search raises TypeError -- confirm whether that path is reachable.
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # Locate the RTMP stream of a live page.  The language code ('fr'/'de')
        # sits four path components from the end of the URL.
        video_lang = url.split('/')[-4]
        # Step 1: find the videothek JS file referenced by the page.
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Step 2: pull stream path, SWF player, and rtmp URL out of that JS.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): video_url is computed but never returned or stored, so
        # callers of extractLiveStream get None -- the live path produces no
        # downloadable result.  Looks like a missing `return`; confirm intent.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        # Resolve a "+7" catch-up video by following the chain:
        # page -> videoref XML -> per-language <video> XML -> HD stream URL.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages end in index-N.html; everything else is a +7 video.
        # NOTE(review): the live branch returns None to the caller because
        # extractLiveStream discards its result (see note above).
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1271
1272
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect (e.g. URL shortener) is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener; handler order matters for redirect/405 handling.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Try progressively broader patterns; flattened from the original
        # nested-if pyramid (each search only runs if the previous missed).
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # BUGFIX: this branch previously reported "unable to extract title".
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
1420
1421
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearchN:/ytsearchall: prefix and dispatch the search."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            # Bare 'ytsearch:' means a single result.
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError:  # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The GData API serves 50 results per page.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            # Idiom fix: 'x not in y' instead of 'not x in y'.
            if 'items' not in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                return

            # Idiom fix: list comprehension instead of list(genexp).
            new_ids = [video['id'] for video in api_response['items']]
            video_ids += new_ids

            # Never page past the total number of hits the API reports.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1500
1501
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearchN:/gvsearchall: prefix and run the search."""
        match = re.match(self._VALID_URL, query)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        # Bare 'gvsearch:' downloads a single result.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        # 'all' requests the service maximum.
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        # Otherwise the prefix must be a result count.
        try:
            count = int(prefix)
        except ValueError:  # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return
        if count <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (count, query))
            return
        if count > self._max_google_results:
            self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, count))
            count = self._max_google_results
        self._download_n_results(query, count)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Harvest new ids from this results page, stopping as soon as the
            # requested count is reached.
            for id_match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = id_match.group(1)
                if candidate in video_ids:
                    continue
                video_ids.append(candidate)
                if len(video_ids) == n:
                    for vid in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                    return

            # No "next" link means the result set is exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            pagenum = pagenum + 1
1582
1583
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearchN:/yvsearchall: prefix and run the search."""
        match = re.match(self._VALID_URL, query)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        # Bare 'yvsearch:' downloads a single result.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        # 'all' requests the service maximum.
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        # Otherwise the prefix must be a result count.
        try:
            count = int(prefix)
        except ValueError:  # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return
        if count <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (count, query))
            return
        if count > self._max_yahoo_results:
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, count))
            count = self._max_yahoo_results
        self._download_n_results(query, count)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Harvest ids from this page, de-duplicating across pages.
            for id_match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = id_match.group(1)
                if vid in already_seen:
                    continue
                video_ids.append(vid)
                already_seen.add(vid)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for queued in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % queued])
                    return

            # No "next" link means the result set is exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for queued in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % queued])
                return

            pagenum = pagenum + 1
1668
1669
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                     (?:https?://)?
                     (?:\w+\.)?
                     youtube\.com/
                     (?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                     |  user/.*?/user/
                     |  p/
                     |  user/.*?\#[pg]/c/
                     )
                     ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                     .*
                  |
                     ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                  )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Fetch all playlist entries via the GData API, apply the user's
        playliststart/playlistend window, and hand the video URLs to the
        downloader."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download playlist videos from API; group(1)/group(2) correspond to
        # the two alternatives in _VALID_URL.
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))
                return

            # Idiom fix: 'x not in y' instead of 'not x in y'.
            if 'feed' not in response or 'entry' not in response['feed']:
                self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API')
                return
            # Keep (position, url) pairs so the final list can be sorted into
            # playlist order; entries without 'content' (deleted videos) are
            # skipped.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means we've reached the end of the playlist.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]
        total = len(videos)

        # playliststart is 1-based in user options; playlistend == -1 means
        # "to the end".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        return self._downloader.extract_info_iterable(videos)
1760
1761
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Pull the channel id out of the URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)

        # Walk the paginated listing until the "next page" marker
        # disappears, accumulating the video ids found on each page.
        video_ids = []
        pagenum = 1
        while True:
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            req = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(req).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's video ids, skipping within-page duplicates
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % vid for vid in video_ids]
        return self._downloader.extract_info_iterable(urls)
1811
1812
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Determine which user's uploads we are fetching
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries,
        # so request consecutive windows until a short page signals the
        # end of the upload list.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # De-duplicate the ids found on this page before appending
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)

            video_ids.extend(page_ids)

            # A page with fewer than _GDATA_PAGE_SIZE ids must be the
            # last one, so there is no need to query again.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply the user's --playlist-start / --playlist-end window
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        return self._downloader.extract_info_iterable(urls)
1894
1895
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the user's numeric id from their page, then pages through
    the mobile AJAX episode list to collect video page paths.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id needed for the AJAX calls is embedded in
        # the user page markup. Fixed: a missing match used to raise an
        # uncaught AttributeError on mobj.group(1).
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id from webpage')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request(url)
            try:
                # Fixed: use compat_str(err) like the rest of the file,
                # not str(err)
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (the path component of each
            # video page URL). Fixed: de-duplicate on the unescaped
            # value, which is what actually gets stored.
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_path = unescapeHTML(mobj.group(1))
                if video_path not in ids_in_page:
                    ids_in_page.append(video_path)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply the user's --playlist-start / --playlist-end window
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        return self._downloader.extract_info_iterable(urls)
1985
1986
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Fixed: decode the response immediately so the str-pattern
            # regexes below work on text (read() returns bytes under
            # Python 3, which would raise TypeError).
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction notice
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Fixed: these values are already text; the old code called
        # .decode('utf-8') on them, which fails on Python 3 str objects.
        file_title = mobj.group(1)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
2045
2046
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook if credentials were supplied."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available: proceed anonymously
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # Fixed: decode the response so the str-pattern regex below
            # works on text — urlopen().read() returns bytes under
            # Python 3, which would raise TypeError.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the login failed
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are emitted between two fixed pieces of
        # JavaScript; grab the JSON array in between.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream, fall back to SD
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2142
2143
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Handles three URL shapes: /play/ redirect pages (resolved to a
    canonical URL and re-extracted), URLs whose JSON endpoint serves the
    video file directly, and regular URLs whose metadata comes back as
    JSON.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a page whose fragment carries the real
        # file reference; resolve it and recurse on the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Ask the site for JSON metadata by appending the skin=json query
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data depending on the User-Agent;
        # impersonating iTunes yields the JSON this extractor expects
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The endpoint returned the video itself, not metadata:
                # synthesize the info dict from the URL and hand the open
                # response to the downloader via 'urlhandle'.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): .decode('UTF-8') assumes a bytes title —
                # Python 2 only; on Python 3 this path would fail. Confirm
                # intended interpreter before changing.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            # urlh is still open from the try block above; read the JSON
            # metadata from it now.
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                # Derive the container extension from the media URL
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                # Covers malformed dates, missing JSON keys and the
                # extension ValueError raised above
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2244
2245
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: this called self._download.trouble (nonexistent
            # attribute), which raised AttributeError instead of
            # reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media base URL; the flv lives
        # next to it under the video id
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
2294
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                           (?P<clip>
                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                      $"""

    # Known bitrates, lowest-quality last
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Bitrate -> container extension
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Bitrate -> video resolution (for --list-formats display)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose-mode pattern
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the show's MRSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available bitrates/resolutions for --list-formats."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Resolve :tds / :colbert style shorthands to the show's
        # full-episodes page, then re-match to populate the named groups
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Work out the episode/clip title; an empty 'episode' group means
        # we should follow the redirect to the newest episode
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The server redirected us to the newest episode; pick the
            # episode title out of the URL we actually landed on
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Find the mtvnservices media URI(s) embedded in the page
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Fetch the MRSS index listing the episode's parts
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        # One <item> per part of the episode
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Fetch the per-part config XML listing the renditions
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp url) pairs for each rendition
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into a plain-HTTP mirror URL
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2489
2490
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        show_name = mobj.group('showname')
        video_id = mobj.group('episode')

        self.report_extraction(show_name)
        try:
            handle = compat_urllib_request.urlopen(url)
            raw_page = handle.read()
            # Honour the charset announced in the Content-Type header,
            # defaulting to UTF-8 when none is given
            charset_match = re.match(r'text/html; charset="?([^"]+)"?', handle.headers['Content-Type'])
            page = raw_page.decode(charset_match.group(1) if charset_match else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Pull description, thumbnail and player URL out of the page's
        # meta tags, then recover the config URL from the player URL
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', page).group(1))
        thumb_url = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        player_url = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        config_url = compat_urllib_parse.unquote(re.search('config=(.*)$', player_url).group(1))

        self.report_config_download(show_name)
        try:
            config_handle = compat_urllib_request.urlopen(config_url)
            charset_match = re.match(r'text/html; charset="?([^"]+)"?', config_handle.headers['Content-Type'])
            config_json = config_handle.read().decode(charset_match.group(1) if charset_match else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        config_json = config_json.replace("'", '"')

        try:
            config = json.loads(config_json)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The second playlist entry carries the actual video URL
        video_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'flv',
            'thumbnail': thumb_url,
            'description': description,
            'player_url': player_url,
        }]
2564
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com

    Fetches the moogaloop metadata XML for a video, then downloads the
    Adobe f4m manifest it points at and builds the final HDS fragment URL.
    """

    # Disabled: the site/API this extractor targets no longer works this way.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report that the XML manifest is being downloaded."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Partially filled here; title/description/url are added below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # Step 1: per-video metadata XML (title, description, manifest URL).
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # Step 2: the f4m manifest; the query parameter selects the HDS
        # core version (presumably required by the server — unverified).
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # Elements live in the Adobe f4m namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the fragment URL on the manifest's host; the last two
        # characters of the manifest id are dropped (f4m URL convention
        # as implemented here — TODO confirm against a live manifest).
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2635
2636
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Announce that information is being extracted."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The flash player URL is percent-encoded in the page flashvars.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title is the page <title> minus the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail: keep the whole matched URL (group 0), not just the name.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2694
2695
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

    Resolves the track page through the public SoundCloud API, then asks
    the stream-definition endpoint for the available streams and picks
    the 128 kbit/s MP3 HTTP stream.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Announce that the track id is being resolved."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that the stream is being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The URL itself carries the uploader and the slug of the title.
        uploader, slug_title = match.group(1), match.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            resolve_request = compat_urllib_request.Request(resolv_url)
            info_json = compat_urllib_request.urlopen(resolve_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            streams_request = compat_urllib_request.Request(streams_url)
            stream_json = compat_urllib_request.urlopen(streams_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id': info['id'],
            'url': streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
2768
2769
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Announce that information is being extracted."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real id is base64-encoded in the page's jsclassref attribute.
        m = re.search(r"jsclassref='([^']*)'", webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(m.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        m = re.search(r'contentTitle = "(.*?)";', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = m.group(1)

        # Description is optional; fall back to a placeholder.
        m = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = m.group(1) if m is not None else u'No description available.'

        # The last path component encodes both the id and the extension.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension,  # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2823
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com

    Downloads the cloudcast JSON and picks a download URL from its
    'audio_formats' section, honouring the user's --format request.
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Returns the URL list for the requested bitrate; if the requested
        bitrate is missing (or 'best'/None), the highest one is used.
        A TypeError from subscripting means the format has no per-bitrate
        sub-dict, in which case the format entry itself is the URL list.
        """
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probes each URL with a real request; the first that opens wins.
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        # Human-readable listing for --list-formats; mirrors the
        # bitrate/no-bitrate duality handled in get_urls().
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE(review): .decode() on a str fails on Python 3; the extractor
        # is disabled via _WORKING = False, which presumably covers this.
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Try each format until one yields a URL that actually opens.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
2938
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a single video page, a course page
    (expanded into its video pages), and the site root (expanded into
    all course pages). Playlist branches recurse through self.extract().
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Each video has a sibling XML file with its title and file name.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            # Extension is whatever follows the last dot of the file name.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Title/description are optional; fall back to the course id.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect all video-page links, preserving order, no duplicates.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse: each reference hits the single-video branch above.
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect all course-page links; each recurses into the
            # course branch above via self.extract().
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3050
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    Reads song/performer metadata from the page's <meta> tags, then asks
    the mediaGen service for the available renditions and picks the last
    (highest-quality) one.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        # The webpage is already decoded text; the former
        # .decode('iso-8859-1') calls raised AttributeError on Python 3.
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing the renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            # Previously crashed with IndexError on an empty list.
            self._downloader.trouble(u'ERROR: unable to find any renditions')
            return

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3130
3131
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves each video as numbered segments; the segment file ids
    are obfuscated with a seeded permutation reproduced below, so the
    exact arithmetic and iteration order must not be changed.
    """
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        # Session id: millisecond timestamp followed by two random parts.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Reproduce Youku's seeded shuffle of the alphabet: a simple
        # linear-congruential sequence drives a draw-without-replacement
        # from `source`, yielding the decoding table for file ids.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # The '*'-separated numbers in fileId index into the mix string.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # getPlayList returns JSON describing title, seed and segments.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the user's --format request onto Youku's stream names.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the segment number (hex) into positions 8-9 of the id.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3241
3242
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Announce the webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that information is being extracted."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page content directly.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # The flash video URL is percent-encoded in the flashvars.
        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3305
3306
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com.

    Downloads the post page, extracts metadata, then follows the photo
    page it embeds and picks the highest-resolution video link.
    """

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's upload date."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # Bug fix: previously fell through and crashed on mobj.group(1).
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # Bug fix: previously fell through and crashed indexing an empty list.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
3430
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages.

    The MP4 URL is derived directly from the path component of the page
    URL; title, date and description are scraped from the page's markup.
    """
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Search the page for a property, falling back to `default`.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Bug fix: the key was misspelled 'uploader_date', so the date
            # never reached the downloader under the documented field name.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3466
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url):
        """Download one API page; return (count of items, list of valid items).

        On any error this returns (0, []) so the caller's tuple unpacking
        and pagination logic keep working.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            # Fixed: previously returned None, which made the caller's
            # `page_count, page_info = self._parse_page(...)` raise TypeError.
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            # The API signals errors with a dict instead of a list
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time looks like YYYY-MM-DD...; strip dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        # Group 1 only -> channel URL (paged archive); group 2 -> single broadcast
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3553
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Fixed: trouble() did not necessarily abort, so execution fell
            # through to m.group() on None (AttributeError). Raise instead.
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            # Same fix as above: abort explicitly when the title is missing
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        # Description is optional
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3590
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so re.VERBOSE is required
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        # Movies, titles and thumbnails appear in the same order on the page,
        # so the three iterators are zipped together below.
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
                # Fixed: previously the entry was appended with an empty URL
                # anyway; skip broken entries instead.
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return videos
3631
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        # The numeric recording id is everything after /recorded/
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The FLV is served from a fixed CDN path keyed by the id
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3653
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is embedded as a JSON blob in an inline script
        metadata_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not metadata_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(metadata_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream from the Akamai URL
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3688
3689
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry in `formats` whose 'format' matches req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The age check is bypassed with a cookie
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional; warn rather than abort)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (optional; warn rather than abort)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 encodes "<size>_<bitrate>_<id>"
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Fixed: previously tested the stale `result` variable left over
            # from the download-list search, so a missing requested format was
            # never reported and [None] could be returned.
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3806
3807
3808
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        # The title is taken from the URL itself, not the page
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Fixed: this error message wrongly said "unable to extract
            # video title" although it is the upload date that failed.
            self._downloader.trouble(u'ERROR: unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3850
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = match.group('videoid')

        # Fetch the main video page
        webpage = self._download_webpage(url, video_id)

        # The title comes from the <title> tag
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # Locate the embed player page, which holds the real stream URL
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the file URL via so.addVariable(...)
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3896
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (song playlists)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as JSON assigned to PAGE.mix
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(m.group(1))

        # A random session id is required by the play API
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        tracks = []
        index = 0
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(index+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            tracks.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            # Each response names the id the "next" endpoint needs
            next_url = ('http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s'
                        % (session, mix_id, track_data['id']))
            index += 1
        return tracks
3940
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Both the stream and the thumbnail live on fixed CDN paths
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        uploader_match = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': unescapeHTML(title_match.group('title')),
            'thumbnail': thumbnail,
            'uploader': unescapeHTML(uploader_match.group('uploader')),
        }]
3964
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Verbose-mode pattern; either a playlist URL or a single-talk URL
    _VALID_URL=r'''http://www.ted.com/
                (
                ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                |
                ((?P<type_talk>talks)) # We have a simple talk
                )
                /(?P<name>\w+) # Here goes the name and then ".html"
                '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is verbose, so re.VERBOSE is required for matching
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on which alternative of _VALID_URL matched
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Each <li id="talk_..."> entry carries the video id and media slug
        video_RE=r'''
                 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                 ([.\s]*?)data-playlist_item_id="(\d+)"
                 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                 '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # Entries and their titles appear in page order, so zip pairs them up
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails JS object carries the real video id and media slug
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4037
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos (metadata via XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Fixed: was `format = ext`, but `ext` is undefined here
            # (NameError); fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        # Description and thumbnail are optional
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4093
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # More specific extractors (playlists, channels, users) must come before
    # the more general ones they overlap with; GenericIE must stay last
    # because it matches almost anything.
    return [
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVUserIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        GenericIE()
    ]
4144
4145