]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
Make ustream IE more robust
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import netrc
9 import os
10 import re
11 import socket
12 import time
13 import email.utils
14 import xml.etree.ElementTree
15 import random
16 import math
17
18 from .utils import *
19
20
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor receives a URL and produces a list of
    dictionaries describing the video(s) behind it (real media URL, title,
    uploader, ...).  Those dictionaries are handed to the FileDownloader,
    which may download the media, print information, etc.

    Mandatory keys in each result dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).

    Optional keys:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader_id:    Nickname or id of the video uploader.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All values should be Unicode strings.

    Subclasses must define a _VALID_URL regexp and override
    _real_initialize() and/or _real_extract(); _real_extract() must return
    a *list* of dictionaries as described above.  Broken extractors should
    set _WORKING to False so users are warned and tests are skipped.
    """

    # Class-level defaults; __init__/initialize() manage the instance state.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        match = re.match(self._VALID_URL, url)
        return match is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc), at most once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derived from the class name by stripping the trailing "IE".
        return type(self).__name__[:-2]

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Fetch a page and return its body decoded as UTF-8 (lossy).

        Raises ExtractorError on any network-level failure.
        """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            handle = compat_urllib_request.urlopen(url_or_request)
            raw = handle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
        return raw.decode('utf-8', 'replace')
121
122
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex: group 1 is the (optional) URL prefix, group 2 the video id.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension ('flv' is assumed for anything not listed)
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> video dimensions, written as "HEIGHTxWIDTH" (height first)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden to compile _VALID_URL with re.VERBOSE (it uses comments).
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption length when no duration given
            start = float(start)
            end = start + float(dur)
            # SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Download closed captions for video_id.

        Returns a (warning_message, srt_contents) pair; exactly one of the
        two is None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name, as advertised by the listing page.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: user-requested, then English, then whatever exists.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        try:
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the given itags with their extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and, if credentials are available, log in
        and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video id from url, or None (after reporting) if the
        URL does not match _VALID_URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # Group 2 of _VALID_URL is the bare video id.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Full extraction: fetch the watch page and get_video_info, then
        build one result dict per selected format."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JSON-style backslash escapes in the URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces before trying the date formats.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): bare except — strptime raises ValueError on a
                    # non-matching format; the next expression is simply tried.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream dict also carries a 'sig'
            # entry — a missing signature would raise KeyError here.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
540
541
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer and POST the age confirmation
        so that filtered videos become reachable."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a metacafe video.

        Returns a one-element list of info dicts, or None after reporting
        an error through the downloader.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate to the
        # YouTube extractor through the downloader.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # Bug fix: decode the response to text exactly once here. The
            # previous code kept the raw bytes and later called .decode()
            # on str regex groups, which breaks on Python 3 and mixed
            # bytes/str in the regex searches below.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available; it must be appended to the media URL.
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob, which carries a JSON-ish
            # mediaData entry with the URL and access key.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
667
668
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best-quality media URL plus metadata for one video."""
        # Validate the URL and isolate the bare video id (strip the
        # slug after '_' and any query string).
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Fetch the watch page with the family filter disabled.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Walk the quality keys from best to worst and keep the first
        # one present in the flashvars blob.
        max_quality = None
        for quality_key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        # Uploader: prefer the regular owner span, then the official
        # author markup; warn (but continue) if neither is present.
        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is not None:
            video_uploader = mobj.group(1)
        else:
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is not None:
                video_uploader = mobj_official.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # Upload date is rendered as DD-MM-YYYY; reassemble as YYYYMMDD.
        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
755
756
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a photobucket clip.

        Returns a one-element list of info dicts, or None after reporting
        an error through the downloader.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Bug fix: decode the response to text exactly once here. The
            # previous code kept the raw bytes and later called .decode()
            # on str regex groups, which breaks on Python 3 and mixed
            # bytes/str in the regex searches below.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader come from the same <title> element.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
820
821
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a video.yahoo.com URL.

        URLs that are valid but not of the '/watch/' form are first
        rewritten to their canonical '/watch/' form, and this method
        recurses exactly once on the rewritten URL (new_video=False).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # BUGFIX: group(1) is the 'people'/'profile' path segment of the
        # profile link; the uploader's display name is group(2). The old
        # code returned the literal string 'people' or 'profile'.
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]
963
964
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    # BUGFIX: the subdomain dot was unescaped ('(?:www|player).'), so it
    # matched any character between the subdomain and 'vimeo.com'.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a vimeo.com URL.

        Parses the player config JSON embedded in the page, then picks the
        best available codec/quality combination (hd > sd > other) and
        builds the play_redirect URL from the request signature/timestamp.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the split markers are absent from the page;
            # ValueError: the extracted span is not valid JSON.
            # (Previously a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit.)
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1079
1080
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its body, or None after reporting an error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, search it with regex, and map match groups to a dict.

        matchTuples is a list of (group_index, key, error_message) tuples;
        each matched group is stored in the result under key. Returns None
        (after reporting the error) if the regex does not match or a
        required group is absent.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): fetch_webpage returns None on download failure, in
        # which case re.search would raise TypeError here -- confirm intended.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream player chain for url.

        NOTE(review): the final video_url is computed but never returned or
        stored, so live URLs currently yield no downloadable result (see
        _real_extract, which discards this call's outcome).
        """
        # Language code is a path segment of the arte.tv URL.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # The JS file names the geo-restricted stream path, the SWF player
        # and the rtmp URL in one blob; pull all three out together.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
            '(http://.*?\.swf).*?' +
            '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Extract an arte.tv +7 (catch-up) video; returns an info dict."""
        # Language code is a path segment of the arte.tv URL.
        video_lang = url.split('/')[-3]
        # Step 1: the player page points at a videoref XML file.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: the videoref file lists one <video> entry per language.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: the per-language XML carries id, title, date and the
        # hd-quality media URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
            '<name>(.*?)</name>.*?' +
            '<dateVideo>(.*?)</dateVideo>.*?' +
            '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or +7 (catch-up) extractor."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live streams: extractLiveStream's result is discarded and
            # nothing is returned (effectively unsupported).
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1215
1216
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so we only fetch headers.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Strip body-describing headers; a HEAD has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        # NOTE(review): HTTPErrorProcessor normally lives in urllib.request,
        # not urllib.error -- confirm compat_urllib_error re-exports it.
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Scrape a page for an embedded media URL (JW Player / SWFObject style)."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # BUGFIX: this branch previously reported 'unable to extract
            # title', copy-pasted from the title check above.
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
1361
1362
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        decoded_query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (decoded_query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # drop the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        try:
            n = int(prefix)
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        limit = n
        pagenum = 0

        while 50 * pagenum < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), 50 * pagenum + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            video_ids.extend(video['id'] for video in api_response['items'])

            # The API reports the real total; never request past it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        for video_id in video_ids[:n]:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1437
1438
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        decoded_query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (decoded_query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # drop the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        try:
            n = int(prefix)
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's identifiers, skipping ones already seen.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id in video_ids:
                    continue
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for vid in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                    return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            pagenum += 1
1519
1520
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        decoded_query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (decoded_query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # drop the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        try:
            n = int(prefix)
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's identifiers, skipping ones already seen.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id in already_seen:
                    continue
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for vid in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                    return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            pagenum += 1
1605
1606
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Queue every video of a playlist, honouring playliststart/playlistend."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's identifiers without duplicating any.
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                video_id = mobj.group(1)
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        total = len(video_ids)

        # Apply the user-selected slice (playliststart is 1-based).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1685
1686
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Queue every video of a channel's upload list for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's identifiers without duplicating any.
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                video_id = mobj.group(1)
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1737
1738
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Fetch a user's uploads via the GData feed and queue each video."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = match.group(1)

        # The GData uploads feed is paginated (currently 50 entries per
        # request), so we request consecutive pages until one comes back
        # short of a full page.
        video_ids = []
        page_idx = 0

        while True:
            start_index = page_idx * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            gdata_request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(gdata_request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Pick out the video ids, de-duplicated within this page.
            page_ids = []
            for id_match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = id_match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)

            video_ids.extend(page_ids)

            # A page shorter than the page size means the feed is
            # exhausted; no further request is needed.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            page_idx += 1

        all_ids_count = len(video_ids)
        first = self._downloader.params.get('playliststart', 1) - 1
        last = self._downloader.params.get('playlistend', -1)

        # Apply the user-requested playlist window.
        video_ids = video_ids[first:] if last == -1 else video_ids[first:last]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1821
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect all video page paths for a blip.tv user and queue them."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id is needed for the AJAX episode-list endpoint.
        # Check the search result explicitly: previously a failed match
        # raised AttributeError on mobj.group(1), which the network-error
        # except clause above did not catch.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id for %s' % username)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Use compat_str for consistency with the rest of the file.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Unescape before the membership test, so HTML-escaped
                # duplicates are recognized as duplicates too (the old code
                # compared the raw match against the unescaped list).
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1912
1913
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct file URL and title from a depositfiles page."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Decode to text right away: urlopen().read() returns bytes on
            # Python 3, and the str regexes below cannot match bytes.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Raw string so \s is a regex class, not an escape sequence.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # The page was already decoded above, so all extracted values are
        # text; the old .decode('utf-8') calls on them raised
        # AttributeError on Python 3 (str has no decode).
        file_title = mobj.group(1)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
1972
1973
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    NOTE(review): _WORKING is False, i.e. this extractor is currently
    disabled; the page-scraping regexes below presumably no longer match
    Facebook's markup — verify before re-enabling.
    """

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Ordered best quality first; also used for --format-limit handling.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page

        Returns a dict with any of 'title', 'description', 'owner',
        'thumbnail' that matched, plus 'video_urls' (format -> URL map,
        possibly empty).
        """
        # General data
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # NOTE(review): .decode("unicode_escape") assumes the match
                # is a bytes object (Python 2 semantics) — on Python 3 str
                # has no .decode; confirm before re-enabling this IE.
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc.

        A missing login is not an error: the method simply returns and
        extraction proceeds unauthenticated.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are only warnings; login is optional.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the credentials were
            # rejected (or we were rate-limited).
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download a Facebook video page and build the info dict list."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (missing thumbnail is only a warning)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    # parsedate_tz returns a 10-tuple; strftime needs the
                    # first 9 fields. Any formatting failure is ignored and
                    # the date stays None.
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if url_map:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # NOTE(review): if url_map is empty, video_url_list is never
        # assigned and the loop below raises NameError — confirm whether
        # an empty url_map can actually occur here.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            })
        return results
2179
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Used to pull the filename extension off the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL points directly at a media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Fetch blip.tv metadata via the skin=json endpoint.

        If the response Content-Type is video/*, the URL is a direct
        media link and the already-open handle is passed through in
        'urlhandle'; otherwise the JSON body is parsed for the info dict.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin parameters with the correct separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): Python 2 idiom — on Python 3, str has no
                # .decode and this branch would raise AttributeError.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Reuse the open response so the file is not fetched twice.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                # urlh is guaranteed bound here: if urlopen had raised we
                # would already have returned above.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # blip.tv serves some content only to the iTunes user agent;
        # note this mutates the shared std_headers for later requests.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2269
2270
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv media URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: this previously called self._download.trouble (no such
            # attribute), raising AttributeError instead of reporting the
            # invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link carries the movie base URL; the flv lives at
        # <base>/<video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
2319
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                     (?P<showname>thedailyshow|colbertnation)\.com/
                     (full-episodes/(?P<episode>.*)|
                      (?P<clip>
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, worst to best; keys into the two maps below.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE, which
        # the default suitable() does not apply.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the episode MRSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available format ids with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve a show/episode/clip URL to one info dict per video part."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shortnames like :tds redirect to the show's full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty episode part means "download the newest episode";
            # the site redirects to the concrete episode URL.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Re-parse the redirected URL to recover the episode component.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        # One <item> per video part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs from the config XML.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp URL to the equivalent plain-HTTP location.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2513
2514
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video URL from the player config referenced by the page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honor the charset declared in the Content-Type header,
            # falling back to utf-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # NOTE(review): the four searches below are unguarded — if any
        # <meta> tag is missing, .group(1) raises AttributeError instead
        # of producing a trouble() report; consider adding None checks.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        # NOTE(review): the quote swap would corrupt values containing
        # apostrophes — presumably acceptable for this config format.
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        # The second playlist entry holds the actual video URL.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2588
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Fetch the moogaloop metadata XML, then the f4m manifest it
        references, and build the URL of the first video fragment.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Result skeleton; title/description/thumbnail/url filled in below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The metadata document carries display fields plus the manifest URL.
        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # hdcore query parameter is expected by Adobe HTTP Dynamic Streaming.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The f4m manifest (Adobe namespace) names the media node and an id.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # assumes the manifest id carries two trailing characters that must
        # be stripped before building the /z fragment path - TODO confirm
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2659
2660
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Pull the flv URL, title and thumbnail out of the page source."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The flv URL is embedded percent-encoded in the page source.
        flv_match = re.search(r'flv_url=(.+?)&', webpage)
        if flv_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(flv_match.group(1))

        # Title comes from the <title> tag, minus the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # For the thumbnail the whole matched URL is used (group 0),
        # not the capture group.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2718
2719
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

    Resolves the page URL to a numeric track id through the public
    /resolve.json endpoint, then asks the streams endpoint for the
    128 kbps MP3 stream URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        # No extractor-specific state; delegate straight to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a single-entry result list for the given track URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title  # NOTE: currently unused

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the human-readable URL to the track's numeric id.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Ask the CDN for this track's per-format stream URLs.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # NOTE(review): 'upload_date' is the raw created_at string here, not
        # the YYYYMMDD form the info-dict contract describes - confirm.
        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
2792
2793
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Decode the base64 'jsclassref' blob into an rtmpe stream URL."""
        if re.match(self._VALID_URL, url) is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real stream id is base64-encoded in the jsclassref attribute.
        ref_match = re.search(r"jsclassref='([^']*)'", webpage)
        if ref_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        decoded = base64.b64decode(ref_match.group(1).encode('ascii')).decode('utf-8')
        real_id = compat_urllib_parse.unquote(decoded)
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        title_match = re.search(r'contentTitle = "(.*?)";', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # Fall back to a placeholder when no meta description is present.
        desc_match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if desc_match is not None:
            video_description = desc_match.group(1)
        else:
            video_description = u'No description available.'

        # The id and extension both come from the stream URL's last segment.
        video_filename = video_url.rpartition('/')[2]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2847
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData maps format -> {bitrate: url_list}, or format -> url_list
        when the API provides no bitrate information (the TypeError path).
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass  # dead link; try the next one

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate/extension combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # FIX: re.match on a (unicode) str yields str groups; calling
        # .decode('utf-8') on them raises AttributeError on Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe each format until one of its URLs is reachable.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # FIX: check_urls may return None (every candidate dead); the old
        # code then crashed calling string methods on None.
        if file_url is None:
            self._downloader.trouble(u'ERROR: no working file URL found')
            return

        # FIX: values parsed from JSON/regex are already unicode str; the
        # previous .decode('utf-8') calls broke on Python 3.
        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (u'NA' if format_param is None else format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2962
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: a single video, a course page, or the
        site root. The playlist branches recurse through self.extract()
        on each referenced page and concatenate the results.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            # Derive the extension from the video file name.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            # Course title; fall back to the course id if none found.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect the course's video-page links (deduplicated, in order)
            # and extract each one recursively.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect every course-page link and extract each recursively.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3079
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Read song/performer meta tags, then fetch the mediaGen XML to
        pick the highest-quality rendition."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        # FIX: webpage is already a unicode str (from _download_webpage);
        # the old .decode('iso-8859-1') on a str raises AttributeError on
        # Python 3.
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # FIX: message previously read 'unable to mtvn_uri'
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        # FIX: an empty rendition list used to raise IndexError below.
        if not renditions:
            self._downloader.trouble(u'ERROR: unable to find renditions')
            return

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        # FIX: a rendition without a <src> child made .find() return None
        # and .text raise AttributeError, which was not caught before.
        except (KeyError, AttributeError):
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3159
3160
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com; rebuilds the segmented flv/mp4
    download URLs from the site's obfuscated file id and seed."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a pseudo-random session id from the millisecond timestamp
        plus two random numbers (mirrors the site player's scheme)."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Deterministically shuffle the alphabet using the server-supplied
        seed; the result is the substitution table for _get_file_id."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # Linear-congruential step; picks and removes one char per round.
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated index string into the real file id by
        looking each index up in the seed-shuffled table."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the requested format to one of the site's stream types;
            # anything other than 'best'/'worst' falls back to flv.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the segment number (hex) into positions 8-9 of fileid.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3270
3271
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        message = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def report_extraction(self, video_id):
        """Report information extraction"""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Extract flv URL, title and thumbnail via the class regexes."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page body.
        try:
            raw_page = compat_urllib_request.urlopen(url).read()
            webpage = raw_page.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # The flv URL is stored percent-encoded in the page.
        url_result = re.search(self.VIDEO_URL_RE, webpage)
        if url_result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_result.group(1))

        title_result = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_result.group(1)

        thumb_result = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_result.group(1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
3334
3335
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's upload date."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Fetch the post page, follow the embedded photos/video page, and
        pick the highest-resolution video link."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date (optional; left as None when not found)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (optional)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # FIX: return was missing here, so execution continued and
            # crashed on mobj.group(1) with AttributeError.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # FIX: return was missing here, so links[-1] below raised
            # IndexError on an empty result.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
3459
class NBAIE(InfoExtractor):
    """Information extractor for videos hosted on nba.com."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        # Strip a trailing /index.html so the id maps onto the CDN path.
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The media file lives on Turner's CDN at a path derived from the id.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Return the first group of rexp in the page (HTML-unescaped),
            # or default when the pattern does not match.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed key: was 'uploader_date', a typo for the documented
            # 'upload_date' field, so the value was silently ignored.
            # NOTE(review): the scraped value is the page's human-readable
            # date, not necessarily YYYYMMDD — confirm downstream handling.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3495
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (item_count, info_list).

        On any error a (0, []) tuple is returned so the caller's tuple
        unpacking never receives None (previously this returned None and
        crashed _real_extract with a TypeError).
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            # API errors come back as a dict with an 'error' field.
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with an ISO date; strip the dashes
                # to get the YYYYMMDD upload_date format.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                info.append({
                    'id': clip['id'],
                    'url': video_url,
                    'title': clip['title'],
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        # A channel URL (only group 1 matched) needs paged archive listing;
        # a /b/<id> URL addresses a single broadcast.
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short (or empty) page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3580
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
            # Bail out: without a match, m.group() below would raise
            # AttributeError on None.
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title'))

        # Description is optional; fall back to None when absent.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3617
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            # Bail out: m.group() on None would raise AttributeError.
            return
        status_id = m.group(1)

        # The description is best-effort (warning only); guard the match so
        # a missing description no longer crashes with AttributeError.
        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'WARNING: Cannot find description')
            desc = None
        else:
            # Strip embedded <a> tags before unescaping.
            desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            return
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
3666
class SteamIE(InfoExtractor):
    """Information extractor for game trailers on store.steampowered.com."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with (?x)-style comments, so it must be
        # matched with re.VERBOSE; hence the override of the default.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        videos = []
        # Movie entries and title spans appear in the same order on the page,
        # so pair them positionally.
        for vid, vtitle in zip(mweb, titles):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            if not video_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
                # Skip this entry: previously a dict with an empty URL was
                # still appended to the results.
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title)
            }
            videos.append(info)
        return videos
3702
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # The FLV file is addressable directly on the CDN by video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        # Guard both scrapes: previously m.group(...) raised AttributeError
        # whenever the page markup changed and a pattern failed to match.
        m = re.search(r'data-title="(?P<title>.+)"', webpage)
        title = m.group('title') if m else None
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        uploader = m.group('uploader') if m else None
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }
        return [info]
3724
3725
def gen_extractors():
    """Build and return one instance of every supported extractor.

    Order matters: the first extractor whose pattern matches a URL is the
    one that handles it, so more specific extractors come first and
    GenericIE is the final fallback.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        GenericIE,
    )
    return [ie_class() for ie_class in extractor_classes]
3769
3770