]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
cfaef29045d95d45fbf7a8baf0b70874e881d0d7
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 from urlparse import parse_qs
19
20 try:
21 import cStringIO as StringIO
22 except ImportError:
23 import StringIO
24
25 from utils import *
26
27
class InfoExtractor(object):
    """Common base for all site-specific information extractors.

    An information extractor takes a URL and produces one dictionary per
    video the URL refers to.  Each dictionary is handed to the
    FileDownloader, which may download the video file or merely print
    data from it.  Mandatory keys:

        id:         Video identifier.
        url:        Final video URL.
        uploader:   Nickname of the video uploader.
        title:      Literal title.
        ext:        Video filename extension.
        format:     Video format.
        player_url: SWF Player URL (may be None).

    Optional keys, used mainly when youtube-dl serves as the backend of
    a video search function (only printed when explicitly requested):

        thumbnail:  Full URL to a video thumbnail image.
        description: One-line video description.

    Subclasses should redefine _real_initialize() and _real_extract(),
    define a _VALID_URL regexp, and usually be added to the list of
    extractors.
    """

    # Class-level defaults; initialize() flips _ready on first use.
    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Create the extractor and attach *downloader* (may be None)."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True when *url* matches this extractor's _VALID_URL."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Perform one-time setup (authentication, etc.) on first call only."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return the info dictionaries for *url*."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the FileDownloader this extractor reports through."""
        self._downloader = downloader

    def _real_initialize(self):
        """Subclass hook: actual initialization work."""
        pass

    def _real_extract(self, url):
        """Subclass hook: actual extraction work."""
        pass
95
96
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose pattern (always compiled with re.VERBOSE).  Group 1 matches the
    # optional URL prefix, group 2 captures the video ID, so a naked ID string
    # is also accepted.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Hitting this URL forces English pages so the scraping regexes below match.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Extracts the original target from age-verification style redirect URLs.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Format (itag) codes, listed in order of quality (best first).
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same codes, but ranking free (webm) formats above equivalent mp4 ones.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything missing falls back to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "heightxwidth" display string used by --list-formats.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base class because _VALID_URL needs re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text.

        Each <text start="..." dur="..."> element becomes one numbered SRT
        cue with HH:MM:SS,mmm timestamps; a missing dur defaults to 4s.
        """
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'
            start = float(start)
            end = start + float(dur)
            # %02i truncates; the %1*1000 term carries the millisecond part.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print the itag/extension/dimensions table for --list-formats."""
        print 'Available formats:'
        for x in formats:
            print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set English language, then optionally log in and confirm age.

        Credentials come from --username/--password or, with --netrc, from
        the 'youtube' machine entry in ~/.netrc.  Network/auth failures are
        reported as warnings (errors for age confirmation) and abort setup.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language (cookie-based; needed so page scraping sees English text)
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract metadata and direct media URLs for one YouTube video.

        Returns a list of info dictionaries (one per requested format), or
        None after reporting trouble.  Relies on the watch page for
        date/description/player and on get_video_info for everything else.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL (needed by rtmpdump for some videos)
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JSON-style backslash escapes in the matched URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try each 'el' variant until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the watch page and normalized to YYYYMMDD
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): bare except silently swallows everything,
                    # including KeyboardInterrupt; a failed parse just leaves
                    # upload_date as the raw scraped string.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions: best-effort; any Trouble raised inside is reported
        # as a warning and leaves video_subtitles as None.
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                # Map lang_code -> track name for every listed caption track.
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                # Language choice: --subtitles-lang, then English, then first listed.
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    srt_lang = srt_lang_list.keys()[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # trouble[0] indexes the exception's args (Python 2 behaviour).
                self._downloader.trouble(trouble[0])

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = urllib.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Each comma-separated entry is itself a URL-encoded query string
            # describing one format (itag, url, sig, ...).
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            # NOTE(review): only 'itag'/'url' are checked above, but 'sig' is
            # indexed unconditionally here — entries without a signature would
            # raise KeyError; confirm whether that can occur.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            results.append({
                'id':       video_id.decode('utf-8'),
                'url':      video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension.decode('utf-8'),
                'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail':    video_thumbnail.decode('utf-8'),
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
493
494
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1 is the numeric/"yt-..." video ID, group 2 the URL slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and POST the opt-out form.

        This disables the site's content filter (via cookies) so adult-gated
        videos are reachable; failures are reported via trouble() and abort.
        """
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a Metacafe video.

        "yt-" prefixed IDs are YouTube mirrors and are delegated to the
        downloader (and thus the YouTube extractor) instead.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            # NOTE(review): message is missing "to" ("unable retrieve") — a
            # user-visible string, left untouched here.
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Old-style page: mediaURL (plus optional gdaKey token) in the markup.
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # New-style page: media data lives inside the flashvars parameter.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'format':   u'NA',
            'player_url':   None,
        }]
622
623
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Any dailymotion TLD; group 1 is the raw path segment after /video/.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best-quality media URL plus metadata for one video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The path segment looks like "<id>_<slug>?..."; keep only the id.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-gated videos are reachable.
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = urllib.unquote(mobj.group(1))

        # Quality keys in descending order; first substring hit wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        video_uploader = u'NA'
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = u'NA'
        # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'format':   u'NA',
            'player_url':   None,
        }]
721
722
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # Matches videoplay URLs on the various Google Video country domains;
    # group 1 is the (signed integer) docid.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title and description for a Google Video docid."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # Prefer the mp4 download_url; fall back to the flv stream URL.
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))
        # Decode the JS hex escapes: \x3d -> '=', \x26 -> '&'.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail: only fetched (via a search query) when the
        # user explicitly asked for it with --get-thumbnail.
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': u'NA',
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'format':   u'NA',
            'player_url':   None,
        }]
816
817
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Group 1 captures the .flv filename from the "current" query parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv media URL, title and uploader for one video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # The direct media URL is the "file" parameter of the video_src link.
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader both come from the page <title>.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'format':   u'NA',
            'player_url':   None,
        }]
883
884
885 class YahooIE(InfoExtractor):
886 """Information extractor for video.yahoo.com."""
887
888 # _VALID_URL matches all Yahoo! Video URLs
889 # _VPAGE_URL matches only the extractable '/watch/' URLs
890 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
891 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
892 IE_NAME = u'video.yahoo'
893
894 def __init__(self, downloader=None):
895 InfoExtractor.__init__(self, downloader)
896
897 def report_download_webpage(self, video_id):
898 """Report webpage download."""
899 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
900
901 def report_extraction(self, video_id):
902 """Report information extraction."""
903 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
904
905 def _real_extract(self, url, new_video=True):
906 # Extract ID from URL
907 mobj = re.match(self._VALID_URL, url)
908 if mobj is None:
909 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
910 return
911
912 video_id = mobj.group(2)
913 video_extension = 'flv'
914
915 # Rewrite valid but non-extractable URLs as
916 # extractable English language /watch/ URLs
917 if re.match(self._VPAGE_URL, url) is None:
918 request = urllib2.Request(url)
919 try:
920 webpage = urllib2.urlopen(request).read()
921 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
922 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
923 return
924
925 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
926 if mobj is None:
927 self._downloader.trouble(u'ERROR: Unable to extract id field')
928 return
929 yahoo_id = mobj.group(1)
930
931 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
932 if mobj is None:
933 self._downloader.trouble(u'ERROR: Unable to extract vid field')
934 return
935 yahoo_vid = mobj.group(1)
936
937 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
938 return self._real_extract(url, new_video=False)
939
940 # Retrieve video webpage to extract further information
941 request = urllib2.Request(url)
942 try:
943 self.report_download_webpage(video_id)
944 webpage = urllib2.urlopen(request).read()
945 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
946 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
947 return
948
949 # Extract uploader and title from webpage
950 self.report_extraction(video_id)
951 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
952 if mobj is None:
953 self._downloader.trouble(u'ERROR: unable to extract video title')
954 return
955 video_title = mobj.group(1).decode('utf-8')
956
957 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
958 if mobj is None:
959 self._downloader.trouble(u'ERROR: unable to extract video uploader')
960 return
961 video_uploader = mobj.group(1).decode('utf-8')
962
963 # Extract video thumbnail
964 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
965 if mobj is None:
966 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
967 return
968 video_thumbnail = mobj.group(1).decode('utf-8')
969
970 # Extract video description
971 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
972 if mobj is None:
973 self._downloader.trouble(u'ERROR: unable to extract video description')
974 return
975 video_description = mobj.group(1).decode('utf-8')
976 if not video_description:
977 video_description = 'No description available.'
978
979 # Extract video height and width
980 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
981 if mobj is None:
982 self._downloader.trouble(u'ERROR: unable to extract video height')
983 return
984 yv_video_height = mobj.group(1)
985
986 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
987 if mobj is None:
988 self._downloader.trouble(u'ERROR: unable to extract video width')
989 return
990 yv_video_width = mobj.group(1)
991
992 # Retrieve video playlist to extract media URL
993 # I'm not completely sure what all these options are, but we
994 # seem to need most of them, otherwise the server sends a 401.
995 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
996 yv_bitrate = '700' # according to Wikipedia this is hard-coded
997 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
998 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
999 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1000 try:
1001 self.report_download_webpage(video_id)
1002 webpage = urllib2.urlopen(request).read()
1003 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1004 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1005 return
1006
1007 # Extract media URL from playlist XML
1008 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1009 if mobj is None:
1010 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1011 return
1012 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1013 video_url = unescapeHTML(video_url)
1014
1015 return [{
1016 'id': video_id.decode('utf-8'),
1017 'url': video_url,
1018 'uploader': video_uploader,
1019 'upload_date': u'NA',
1020 'title': video_title,
1021 'ext': video_extension.decode('utf-8'),
1022 'thumbnail': video_thumbnail.decode('utf-8'),
1023 'description': video_description,
1024 'thumbnail': video_thumbnail,
1025 'player_url': None,
1026 }]
1027
1028
1029 class VimeoIE(InfoExtractor):
1030 """Information extractor for vimeo.com."""
1031
1032 # _VALID_URL matches Vimeo URLs
1033 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1034 IE_NAME = u'vimeo'
1035
1036 def __init__(self, downloader=None):
1037 InfoExtractor.__init__(self, downloader)
1038
1039 def report_download_webpage(self, video_id):
1040 """Report webpage download."""
1041 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1042
1043 def report_extraction(self, video_id):
1044 """Report information extraction."""
1045 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1046
1047 def _real_extract(self, url, new_video=True):
1048 # Extract ID from URL
1049 mobj = re.match(self._VALID_URL, url)
1050 if mobj is None:
1051 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1052 return
1053
1054 video_id = mobj.group(1)
1055
1056 # Retrieve video webpage to extract further information
1057 request = urllib2.Request(url, None, std_headers)
1058 try:
1059 self.report_download_webpage(video_id)
1060 webpage = urllib2.urlopen(request).read()
1061 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1062 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1063 return
1064
1065 # Now we begin extracting as much information as we can from what we
1066 # retrieved. First we extract the information common to all extractors,
1067 # and latter we extract those that are Vimeo specific.
1068 self.report_extraction(video_id)
1069
1070 # Extract the config JSON
1071 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1072 try:
1073 config = json.loads(config)
1074 except:
1075 self._downloader.trouble(u'ERROR: unable to extract info section')
1076 return
1077
1078 # Extract title
1079 video_title = config["video"]["title"]
1080
1081 # Extract uploader
1082 video_uploader = config["video"]["owner"]["name"]
1083
1084 # Extract video thumbnail
1085 video_thumbnail = config["video"]["thumbnail"]
1086
1087 # Extract video description
1088 video_description = get_element_by_id("description", webpage.decode('utf8'))
1089 if video_description: video_description = clean_html(video_description)
1090 else: video_description = ''
1091
1092 # Extract upload date
1093 video_upload_date = u'NA'
1094 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1095 if mobj is not None:
1096 video_upload_date = mobj.group(1)
1097
1098 # Vimeo specific: extract request signature and timestamp
1099 sig = config['request']['signature']
1100 timestamp = config['request']['timestamp']
1101
1102 # Vimeo specific: extract video codec and quality information
1103 # TODO bind to format param
1104 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1105 for codec in codecs:
1106 if codec[0] in config["video"]["files"]:
1107 video_codec = codec[0]
1108 video_extension = codec[1]
1109 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1110 else: quality = 'sd'
1111 break
1112 else:
1113 self._downloader.trouble(u'ERROR: no known codec found')
1114 return
1115
1116 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1117 %(video_id, sig, timestamp, quality, video_codec.upper())
1118
1119 return [{
1120 'id': video_id,
1121 'url': video_url,
1122 'uploader': video_uploader,
1123 'upload_date': video_upload_date,
1124 'title': video_title,
1125 'ext': video_extension,
1126 'thumbnail': video_thumbnail,
1127 'description': video_description,
1128 'player_url': None,
1129 }]
1130
1131
1132 class GenericIE(InfoExtractor):
1133 """Generic last-resort information extractor."""
1134
1135 _VALID_URL = r'.*'
1136 IE_NAME = u'generic'
1137
1138 def __init__(self, downloader=None):
1139 InfoExtractor.__init__(self, downloader)
1140
1141 def report_download_webpage(self, video_id):
1142 """Report webpage download."""
1143 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1144 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1145
1146 def report_extraction(self, video_id):
1147 """Report information extraction."""
1148 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1149
1150 def report_following_redirect(self, new_url):
1151 """Report information extraction."""
1152 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1153
1154 def _test_redirect(self, url):
1155 """Check if it is a redirect, like url shorteners, in case restart chain."""
1156 class HeadRequest(urllib2.Request):
1157 def get_method(self):
1158 return "HEAD"
1159
1160 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1161 """
1162 Subclass the HTTPRedirectHandler to make it use our
1163 HeadRequest also on the redirected URL
1164 """
1165 def redirect_request(self, req, fp, code, msg, headers, newurl):
1166 if code in (301, 302, 303, 307):
1167 newurl = newurl.replace(' ', '%20')
1168 newheaders = dict((k,v) for k,v in req.headers.items()
1169 if k.lower() not in ("content-length", "content-type"))
1170 return HeadRequest(newurl,
1171 headers=newheaders,
1172 origin_req_host=req.get_origin_req_host(),
1173 unverifiable=True)
1174 else:
1175 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1176
1177 class HTTPMethodFallback(urllib2.BaseHandler):
1178 """
1179 Fallback to GET if HEAD is not allowed (405 HTTP error)
1180 """
1181 def http_error_405(self, req, fp, code, msg, headers):
1182 fp.read()
1183 fp.close()
1184
1185 newheaders = dict((k,v) for k,v in req.headers.items()
1186 if k.lower() not in ("content-length", "content-type"))
1187 return self.parent.open(urllib2.Request(req.get_full_url(),
1188 headers=newheaders,
1189 origin_req_host=req.get_origin_req_host(),
1190 unverifiable=True))
1191
1192 # Build our opener
1193 opener = urllib2.OpenerDirector()
1194 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1195 HTTPMethodFallback, HEADRedirectHandler,
1196 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1197 opener.add_handler(handler())
1198
1199 response = opener.open(HeadRequest(url))
1200 new_url = response.geturl()
1201
1202 if url == new_url: return False
1203
1204 self.report_following_redirect(new_url)
1205 self._downloader.download([new_url])
1206 return True
1207
1208 def _real_extract(self, url):
1209 if self._test_redirect(url): return
1210
1211 video_id = url.split('/')[-1]
1212 request = urllib2.Request(url)
1213 try:
1214 self.report_download_webpage(video_id)
1215 webpage = urllib2.urlopen(request).read()
1216 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1217 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1218 return
1219 except ValueError, err:
1220 # since this is the last-resort InfoExtractor, if
1221 # this error is thrown, it'll be thrown here
1222 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1223 return
1224
1225 self.report_extraction(video_id)
1226 # Start with something easy: JW Player in SWFObject
1227 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1228 if mobj is None:
1229 # Broaden the search a little bit
1230 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1231 if mobj is None:
1232 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1233 return
1234
1235 # It's possible that one of the regexes
1236 # matched, but returned an empty group:
1237 if mobj.group(1) is None:
1238 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1239 return
1240
1241 video_url = urllib.unquote(mobj.group(1))
1242 video_id = os.path.basename(video_url)
1243
1244 # here's a fun little line of code for you:
1245 video_extension = os.path.splitext(video_id)[1][1:]
1246 video_id = os.path.splitext(video_id)[0]
1247
1248 # it's tempting to parse this further, but you would
1249 # have to take into account all the variations like
1250 # Video Title - Site Name
1251 # Site Name | Video Title
1252 # Video Title - Tagline | Site Name
1253 # and so on and so forth; it's just not practical
1254 mobj = re.search(r'<title>(.*)</title>', webpage)
1255 if mobj is None:
1256 self._downloader.trouble(u'ERROR: unable to extract title')
1257 return
1258 video_title = mobj.group(1).decode('utf-8')
1259
1260 # video uploader is domain name
1261 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1262 if mobj is None:
1263 self._downloader.trouble(u'ERROR: unable to extract title')
1264 return
1265 video_uploader = mobj.group(1).decode('utf-8')
1266
1267 return [{
1268 'id': video_id.decode('utf-8'),
1269 'url': video_url.decode('utf-8'),
1270 'uploader': video_uploader,
1271 'upload_date': u'NA',
1272 'title': video_title,
1273 'ext': video_extension.decode('utf-8'),
1274 'format': u'NA',
1275 'player_url': None,
1276 }]
1277
1278
1279 class YoutubeSearchIE(InfoExtractor):
1280 """Information Extractor for YouTube search queries."""
1281 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1282 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1283 _max_youtube_results = 1000
1284 IE_NAME = u'youtube:search'
1285
1286 def __init__(self, downloader=None):
1287 InfoExtractor.__init__(self, downloader)
1288
1289 def report_download_page(self, query, pagenum):
1290 """Report attempt to download search page with given number."""
1291 query = query.decode(preferredencoding())
1292 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1293
1294 def _real_extract(self, query):
1295 mobj = re.match(self._VALID_URL, query)
1296 if mobj is None:
1297 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1298 return
1299
1300 prefix, query = query.split(':')
1301 prefix = prefix[8:]
1302 query = query.encode('utf-8')
1303 if prefix == '':
1304 self._download_n_results(query, 1)
1305 return
1306 elif prefix == 'all':
1307 self._download_n_results(query, self._max_youtube_results)
1308 return
1309 else:
1310 try:
1311 n = long(prefix)
1312 if n <= 0:
1313 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1314 return
1315 elif n > self._max_youtube_results:
1316 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1317 n = self._max_youtube_results
1318 self._download_n_results(query, n)
1319 return
1320 except ValueError: # parsing prefix as integer fails
1321 self._download_n_results(query, 1)
1322 return
1323
1324 def _download_n_results(self, query, n):
1325 """Downloads a specified number of results for a query"""
1326
1327 video_ids = []
1328 pagenum = 0
1329 limit = n
1330
1331 while (50 * pagenum) < limit:
1332 self.report_download_page(query, pagenum+1)
1333 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1334 request = urllib2.Request(result_url)
1335 try:
1336 data = urllib2.urlopen(request).read()
1337 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1338 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1339 return
1340 api_response = json.loads(data)['data']
1341
1342 new_ids = list(video['id'] for video in api_response['items'])
1343 video_ids += new_ids
1344
1345 limit = min(n, api_response['totalItems'])
1346 pagenum += 1
1347
1348 if len(video_ids) > n:
1349 video_ids = video_ids[:n]
1350 for id in video_ids:
1351 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1352 return
1353
1354
1355 class GoogleSearchIE(InfoExtractor):
1356 """Information Extractor for Google Video search queries."""
1357 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1358 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1359 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1360 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1361 _max_google_results = 1000
1362 IE_NAME = u'video.google:search'
1363
1364 def __init__(self, downloader=None):
1365 InfoExtractor.__init__(self, downloader)
1366
1367 def report_download_page(self, query, pagenum):
1368 """Report attempt to download playlist page with given number."""
1369 query = query.decode(preferredencoding())
1370 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1371
1372 def _real_extract(self, query):
1373 mobj = re.match(self._VALID_URL, query)
1374 if mobj is None:
1375 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1376 return
1377
1378 prefix, query = query.split(':')
1379 prefix = prefix[8:]
1380 query = query.encode('utf-8')
1381 if prefix == '':
1382 self._download_n_results(query, 1)
1383 return
1384 elif prefix == 'all':
1385 self._download_n_results(query, self._max_google_results)
1386 return
1387 else:
1388 try:
1389 n = long(prefix)
1390 if n <= 0:
1391 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1392 return
1393 elif n > self._max_google_results:
1394 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1395 n = self._max_google_results
1396 self._download_n_results(query, n)
1397 return
1398 except ValueError: # parsing prefix as integer fails
1399 self._download_n_results(query, 1)
1400 return
1401
1402 def _download_n_results(self, query, n):
1403 """Downloads a specified number of results for a query"""
1404
1405 video_ids = []
1406 pagenum = 0
1407
1408 while True:
1409 self.report_download_page(query, pagenum)
1410 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1411 request = urllib2.Request(result_url)
1412 try:
1413 page = urllib2.urlopen(request).read()
1414 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1415 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1416 return
1417
1418 # Extract video identifiers
1419 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1420 video_id = mobj.group(1)
1421 if video_id not in video_ids:
1422 video_ids.append(video_id)
1423 if len(video_ids) == n:
1424 # Specified n videos reached
1425 for id in video_ids:
1426 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1427 return
1428
1429 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1430 for id in video_ids:
1431 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1432 return
1433
1434 pagenum = pagenum + 1
1435
1436
1437 class YahooSearchIE(InfoExtractor):
1438 """Information Extractor for Yahoo! Video search queries."""
1439 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1440 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1441 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1442 _MORE_PAGES_INDICATOR = r'\s*Next'
1443 _max_yahoo_results = 1000
1444 IE_NAME = u'video.yahoo:search'
1445
1446 def __init__(self, downloader=None):
1447 InfoExtractor.__init__(self, downloader)
1448
1449 def report_download_page(self, query, pagenum):
1450 """Report attempt to download playlist page with given number."""
1451 query = query.decode(preferredencoding())
1452 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1453
1454 def _real_extract(self, query):
1455 mobj = re.match(self._VALID_URL, query)
1456 if mobj is None:
1457 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1458 return
1459
1460 prefix, query = query.split(':')
1461 prefix = prefix[8:]
1462 query = query.encode('utf-8')
1463 if prefix == '':
1464 self._download_n_results(query, 1)
1465 return
1466 elif prefix == 'all':
1467 self._download_n_results(query, self._max_yahoo_results)
1468 return
1469 else:
1470 try:
1471 n = long(prefix)
1472 if n <= 0:
1473 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1474 return
1475 elif n > self._max_yahoo_results:
1476 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1477 n = self._max_yahoo_results
1478 self._download_n_results(query, n)
1479 return
1480 except ValueError: # parsing prefix as integer fails
1481 self._download_n_results(query, 1)
1482 return
1483
1484 def _download_n_results(self, query, n):
1485 """Downloads a specified number of results for a query"""
1486
1487 video_ids = []
1488 already_seen = set()
1489 pagenum = 1
1490
1491 while True:
1492 self.report_download_page(query, pagenum)
1493 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1494 request = urllib2.Request(result_url)
1495 try:
1496 page = urllib2.urlopen(request).read()
1497 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1498 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1499 return
1500
1501 # Extract video identifiers
1502 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1503 video_id = mobj.group(1)
1504 if video_id not in already_seen:
1505 video_ids.append(video_id)
1506 already_seen.add(video_id)
1507 if len(video_ids) == n:
1508 # Specified n videos reached
1509 for id in video_ids:
1510 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1511 return
1512
1513 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1514 for id in video_ids:
1515 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1516 return
1517
1518 pagenum = pagenum + 1
1519
1520
1521 class YoutubePlaylistIE(InfoExtractor):
1522 """Information Extractor for YouTube playlists."""
1523
1524 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1525 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1526 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
1527 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1528 IE_NAME = u'youtube:playlist'
1529
1530 def __init__(self, downloader=None):
1531 InfoExtractor.__init__(self, downloader)
1532
1533 def report_download_page(self, playlist_id, pagenum):
1534 """Report attempt to download playlist page with given number."""
1535 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1536
1537 def _real_extract(self, url):
1538 # Extract playlist id
1539 mobj = re.match(self._VALID_URL, url)
1540 if mobj is None:
1541 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1542 return
1543
1544 # Single video case
1545 if mobj.group(3) is not None:
1546 self._downloader.download([mobj.group(3)])
1547 return
1548
1549 # Download playlist pages
1550 # prefix is 'p' as default for playlists but there are other types that need extra care
1551 playlist_prefix = mobj.group(1)
1552 if playlist_prefix == 'a':
1553 playlist_access = 'artist'
1554 else:
1555 playlist_prefix = 'p'
1556 playlist_access = 'view_play_list'
1557 playlist_id = mobj.group(2)
1558 video_ids = []
1559 pagenum = 1
1560
1561 while True:
1562 self.report_download_page(playlist_id, pagenum)
1563 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1564 request = urllib2.Request(url)
1565 try:
1566 page = urllib2.urlopen(request).read()
1567 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1568 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1569 return
1570
1571 # Extract video identifiers
1572 ids_in_page = []
1573 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1574 if mobj.group(1) not in ids_in_page:
1575 ids_in_page.append(mobj.group(1))
1576 video_ids.extend(ids_in_page)
1577
1578 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1579 break
1580 pagenum = pagenum + 1
1581
1582 playliststart = self._downloader.params.get('playliststart', 1) - 1
1583 playlistend = self._downloader.params.get('playlistend', -1)
1584 if playlistend == -1:
1585 video_ids = video_ids[playliststart:]
1586 else:
1587 video_ids = video_ids[playliststart:playlistend]
1588
1589 for id in video_ids:
1590 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1591 return
1592
1593
1594 class YoutubeChannelIE(InfoExtractor):
1595 """Information Extractor for YouTube channels."""
1596
1597 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1598 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1599 _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1600 IE_NAME = u'youtube:channel'
1601
1602 def report_download_page(self, channel_id, pagenum):
1603 """Report attempt to download channel page with given number."""
1604 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1605
1606 def _real_extract(self, url):
1607 # Extract channel id
1608 mobj = re.match(self._VALID_URL, url)
1609 if mobj is None:
1610 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1611 return
1612
1613 # Download channel pages
1614 channel_id = mobj.group(1)
1615 video_ids = []
1616 pagenum = 1
1617
1618 while True:
1619 self.report_download_page(channel_id, pagenum)
1620 url = self._TEMPLATE_URL % (channel_id, pagenum)
1621 request = urllib2.Request(url)
1622 try:
1623 page = urllib2.urlopen(request).read()
1624 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1625 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1626 return
1627
1628 # Extract video identifiers
1629 ids_in_page = []
1630 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1631 if mobj.group(1) not in ids_in_page:
1632 ids_in_page.append(mobj.group(1))
1633 video_ids.extend(ids_in_page)
1634
1635 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1636 break
1637 pagenum = pagenum + 1
1638
1639 for id in video_ids:
1640 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1641 return
1642
1643
1644 class YoutubeUserIE(InfoExtractor):
1645 """Information Extractor for YouTube users."""
1646
1647 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1648 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1649 _GDATA_PAGE_SIZE = 50
1650 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1651 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1652 IE_NAME = u'youtube:user'
1653
1654 def __init__(self, downloader=None):
1655 InfoExtractor.__init__(self, downloader)
1656
1657 def report_download_page(self, username, start_index):
1658 """Report attempt to download user page."""
1659 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1660 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1661
1662 def _real_extract(self, url):
1663 # Extract username
1664 mobj = re.match(self._VALID_URL, url)
1665 if mobj is None:
1666 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1667 return
1668
1669 username = mobj.group(1)
1670
1671 # Download video ids using YouTube Data API. Result size per
1672 # query is limited (currently to 50 videos) so we need to query
1673 # page by page until there are no video ids - it means we got
1674 # all of them.
1675
1676 video_ids = []
1677 pagenum = 0
1678
1679 while True:
1680 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1681 self.report_download_page(username, start_index)
1682
1683 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1684
1685 try:
1686 page = urllib2.urlopen(request).read()
1687 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1688 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1689 return
1690
1691 # Extract video identifiers
1692 ids_in_page = []
1693
1694 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1695 if mobj.group(1) not in ids_in_page:
1696 ids_in_page.append(mobj.group(1))
1697
1698 video_ids.extend(ids_in_page)
1699
1700 # A little optimization - if current page is not
1701 # "full", ie. does not contain PAGE_SIZE video ids then
1702 # we can assume that this page is the last one - there
1703 # are no more ids on further pages - no need to query
1704 # again.
1705
1706 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1707 break
1708
1709 pagenum += 1
1710
1711 all_ids_count = len(video_ids)
1712 playliststart = self._downloader.params.get('playliststart', 1) - 1
1713 playlistend = self._downloader.params.get('playlistend', -1)
1714
1715 if playlistend == -1:
1716 video_ids = video_ids[playliststart:]
1717 else:
1718 video_ids = video_ids[playliststart:playlistend]
1719
1720 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1721 (username, all_ids_count, len(video_ids)))
1722
1723 for video_id in video_ids:
1724 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1725
1726
1727 class BlipTVUserIE(InfoExtractor):
1728 """Information Extractor for blip.tv users."""
1729
1730 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1731 _PAGE_SIZE = 12
1732 IE_NAME = u'blip.tv:user'
1733
1734 def __init__(self, downloader=None):
1735 InfoExtractor.__init__(self, downloader)
1736
1737 def report_download_page(self, username, pagenum):
1738 """Report attempt to download user page."""
1739 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1740 (self.IE_NAME, username, pagenum))
1741
1742 def _real_extract(self, url):
1743 # Extract username
1744 mobj = re.match(self._VALID_URL, url)
1745 if mobj is None:
1746 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1747 return
1748
1749 username = mobj.group(1)
1750
1751 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1752
1753 request = urllib2.Request(url)
1754
1755 try:
1756 page = urllib2.urlopen(request).read().decode('utf-8')
1757 mobj = re.search(r'data-users-id="([^"]+)"', page)
1758 page_base = page_base % mobj.group(1)
1759 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1760 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1761 return
1762
1763
1764 # Download video ids using BlipTV Ajax calls. Result size per
1765 # query is limited (currently to 12 videos) so we need to query
1766 # page by page until there are no video ids - it means we got
1767 # all of them.
1768
1769 video_ids = []
1770 pagenum = 1
1771
1772 while True:
1773 self.report_download_page(username, pagenum)
1774
1775 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1776
1777 try:
1778 page = urllib2.urlopen(request).read().decode('utf-8')
1779 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1780 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1781 return
1782
1783 # Extract video identifiers
1784 ids_in_page = []
1785
1786 for mobj in re.finditer(r'href="/([^"]+)"', page):
1787 if mobj.group(1) not in ids_in_page:
1788 ids_in_page.append(unescapeHTML(mobj.group(1)))
1789
1790 video_ids.extend(ids_in_page)
1791
1792 # A little optimization - if current page is not
1793 # "full", ie. does not contain PAGE_SIZE video ids then
1794 # we can assume that this page is the last one - there
1795 # are no more ids on further pages - no need to query
1796 # again.
1797
1798 if len(ids_in_page) < self._PAGE_SIZE:
1799 break
1800
1801 pagenum += 1
1802
1803 all_ids_count = len(video_ids)
1804 playliststart = self._downloader.params.get('playliststart', 1) - 1
1805 playlistend = self._downloader.params.get('playlistend', -1)
1806
1807 if playlistend == -1:
1808 video_ids = video_ids[playliststart:]
1809 else:
1810 video_ids = video_ids[playliststart:playlistend]
1811
1812 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1813 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1814
1815 for video_id in video_ids:
1816 self._downloader.download([u'http://blip.tv/'+video_id])
1817
1818
1819 class DepositFilesIE(InfoExtractor):
1820 """Information extractor for depositfiles.com"""
1821
1822 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1823 IE_NAME = u'DepositFiles'
1824
1825 def __init__(self, downloader=None):
1826 InfoExtractor.__init__(self, downloader)
1827
1828 def report_download_webpage(self, file_id):
1829 """Report webpage download."""
1830 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1831
1832 def report_extraction(self, file_id):
1833 """Report information extraction."""
1834 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1835
1836 def _real_extract(self, url):
1837 file_id = url.split('/')[-1]
1838 # Rebuild url in english locale
1839 url = 'http://depositfiles.com/en/files/' + file_id
1840
1841 # Retrieve file webpage with 'Free download' button pressed
1842 free_download_indication = { 'gateway_result' : '1' }
1843 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1844 try:
1845 self.report_download_webpage(file_id)
1846 webpage = urllib2.urlopen(request).read()
1847 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1848 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1849 return
1850
1851 # Search for the real file URL
1852 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1853 if (mobj is None) or (mobj.group(1) is None):
1854 # Try to figure out reason of the error.
1855 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1856 if (mobj is not None) and (mobj.group(1) is not None):
1857 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1858 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1859 else:
1860 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1861 return
1862
1863 file_url = mobj.group(1)
1864 file_extension = os.path.splitext(file_url)[1][1:]
1865
1866 # Search for file title
1867 mobj = re.search(r'<b title="(.*?)">', webpage)
1868 if mobj is None:
1869 self._downloader.trouble(u'ERROR: unable to extract title')
1870 return
1871 file_title = mobj.group(1).decode('utf-8')
1872
1873 return [{
1874 'id': file_id.decode('utf-8'),
1875 'url': file_url.decode('utf-8'),
1876 'uploader': u'NA',
1877 'upload_date': u'NA',
1878 'title': file_title,
1879 'ext': file_extension.decode('utf-8'),
1880 'format': u'NA',
1881 'player_url': None,
1882 }]
1883
1884
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Format names ordered best quality first; the order drives default
    # format selection in _real_extract.
    _available_formats = ['video', 'highqual', 'lowqual']
    # Format name -> container extension.
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page.

        Scrapes title/description/owner/thumbnail plus the per-format
        stream URLs out of JavaScript fragments in the page source and
        returns them as a dict (the URLs under key 'video_urls').
        Missing fields are simply absent from the result.
        """
        # General data
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are percent- and \uXXXX-escaped inside the JS.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook when credentials are supplied via options
        or .netrc; otherwise continue anonymously."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # No credentials found: skip the login step entirely.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the response still contains a login <form>, the
            # authentication did not succeed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

    def _real_extract(self, url):
        """Download the video page, parse it and return one info dict per
        selected format."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (missing thumbnail is only a warning, not fatal)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = u'NA'
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    # parsedate_tz yields a 10-tuple; drop the trailing tz
                    # offset so strftime gets the 9 fields it expects.
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        # NOTE(review): if url_map is empty, video_url_list is never bound
        # and the loop below raises NameError — confirm intended behavior.
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # Honour --format-limit by truncating the preference list.
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
                'player_url': None,
            })
        return results
2090
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the lowercase filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Request the JSON description of the video by appending the
        # skin=json parameters (joined with & or ? as appropriate).
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url.encode('utf-8'))
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = urllib2.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The URL pointed straight at a media file: derive id and
                # title from the URL basename and pass the already-open
                # handle along under 'urlhandle'.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
            return
        if info is None: # Regular URL
            try:
                # Reuse the handle opened above to read the JSON body.
                json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' object or bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # NOTE(review): '%H' (24-hour) combined with '%p' looks
                # off ('%I' would be the 12-hour directive), but only the
                # date portion is kept by '%Y%m%d' so the output should be
                # unaffected — confirm against real datestamp values.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # Present ourselves as iTunes for the actual media download;
        # presumably blip.tv serves the raw file to this UA — verify.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2177
2178
2179 class MyVideoIE(InfoExtractor):
2180 """Information Extractor for myvideo.de."""
2181
2182 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2183 IE_NAME = u'myvideo'
2184
2185 def __init__(self, downloader=None):
2186 InfoExtractor.__init__(self, downloader)
2187
2188 def report_download_webpage(self, video_id):
2189 """Report webpage download."""
2190 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2191
2192 def report_extraction(self, video_id):
2193 """Report information extraction."""
2194 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2195
2196 def _real_extract(self,url):
2197 mobj = re.match(self._VALID_URL, url)
2198 if mobj is None:
2199 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2200 return
2201
2202 video_id = mobj.group(1)
2203
2204 # Get video webpage
2205 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2206 try:
2207 self.report_download_webpage(video_id)
2208 webpage = urllib2.urlopen(request).read()
2209 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2210 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2211 return
2212
2213 self.report_extraction(video_id)
2214 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2215 webpage)
2216 if mobj is None:
2217 self._downloader.trouble(u'ERROR: unable to extract media URL')
2218 return
2219 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2220
2221 mobj = re.search('<title>([^<]+)</title>', webpage)
2222 if mobj is None:
2223 self._downloader.trouble(u'ERROR: unable to extract title')
2224 return
2225
2226 video_title = mobj.group(1)
2227
2228 return [{
2229 'id': video_id,
2230 'url': video_url,
2231 'uploader': u'NA',
2232 'upload_date': u'NA',
2233 'title': video_title,
2234 'ext': u'flv',
2235 'format': u'NA',
2236 'player_url': None,
2237 }]
2238
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Matches either a shortname pseudo-URL (":tds", ":colbert", ...) or a
    # full-episodes URL on thedailyshow.com / colbertnation.com.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        """Resolve a show or episode URL to one info dict per media part,
        going through the mtvnservices player, the MRSS show index and the
        per-item mediaGen configuration."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Rewrite shortname forms to the show's full-episodes page and
        # re-match so the named groups are populated.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No specific episode given: the site will redirect to the newest.
        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
            return
        if dlNewest:
            # We were redirected to a concrete episode; re-parse the final
            # URL to recover the episode name.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Each match is (full player URL, media uri) from the embedded
        # Flash player parameters.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            return

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Follow redirects so we hand downstream the final player URL.
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
            return

        results = []

        # The MRSS index lists one <item> per media part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
                return

            # Collect (bitrate, url) pairs for every available rendition.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': playerUrl
            }

            results.append(info)

        return results
2370
2371
2372 class EscapistIE(InfoExtractor):
2373 """Information extractor for The Escapist """
2374
2375 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2376 IE_NAME = u'escapist'
2377
2378 def report_extraction(self, showName):
2379 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2380
2381 def report_config_download(self, showName):
2382 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2383
2384 def _real_extract(self, url):
2385 mobj = re.match(self._VALID_URL, url)
2386 if mobj is None:
2387 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2388 return
2389 showName = mobj.group('showname')
2390 videoId = mobj.group('episode')
2391
2392 self.report_extraction(showName)
2393 try:
2394 webPage = urllib2.urlopen(url)
2395 webPageBytes = webPage.read()
2396 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2397 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2398 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2399 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2400 return
2401
2402 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2403 description = unescapeHTML(descMatch.group(1))
2404 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2405 imgUrl = unescapeHTML(imgMatch.group(1))
2406 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2407 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2408 configUrlMatch = re.search('config=(.*)$', playerUrl)
2409 configUrl = urllib2.unquote(configUrlMatch.group(1))
2410
2411 self.report_config_download(showName)
2412 try:
2413 configJSON = urllib2.urlopen(configUrl).read()
2414 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2415 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2416 return
2417
2418 # Technically, it's JavaScript, not JSON
2419 configJSON = configJSON.replace("'", '"')
2420
2421 try:
2422 config = json.loads(configJSON)
2423 except (ValueError,), err:
2424 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2425 return
2426
2427 playlist = config['playlist']
2428 videoUrl = playlist[1]['url']
2429
2430 info = {
2431 'id': videoId,
2432 'url': videoUrl,
2433 'uploader': showName,
2434 'upload_date': None,
2435 'title': showName,
2436 'ext': 'flv',
2437 'format': 'flv',
2438 'thumbnail': imgUrl,
2439 'description': description,
2440 'player_url': playerUrl,
2441 }
2442
2443 return [info]
2444
2445
2446 class CollegeHumorIE(InfoExtractor):
2447 """Information extractor for collegehumor.com"""
2448
2449 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2450 IE_NAME = u'collegehumor'
2451
2452 def report_webpage(self, video_id):
2453 """Report information extraction."""
2454 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2455
2456 def report_extraction(self, video_id):
2457 """Report information extraction."""
2458 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2459
2460 def _real_extract(self, url):
2461 mobj = re.match(self._VALID_URL, url)
2462 if mobj is None:
2463 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2464 return
2465 video_id = mobj.group('videoid')
2466
2467 self.report_webpage(video_id)
2468 request = urllib2.Request(url)
2469 try:
2470 webpage = urllib2.urlopen(request).read()
2471 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2472 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2473 return
2474
2475 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2476 if m is None:
2477 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2478 return
2479 internal_video_id = m.group('internalvideoid')
2480
2481 info = {
2482 'id': video_id,
2483 'internal_id': internal_video_id,
2484 }
2485
2486 self.report_extraction(video_id)
2487 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2488 try:
2489 metaXml = urllib2.urlopen(xmlUrl).read()
2490 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2491 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2492 return
2493
2494 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2495 try:
2496 videoNode = mdoc.findall('./video')[0]
2497 info['description'] = videoNode.findall('./description')[0].text
2498 info['title'] = videoNode.findall('./caption')[0].text
2499 info['url'] = videoNode.findall('./file')[0].text
2500 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2501 info['ext'] = info['url'].rpartition('.')[2]
2502 info['format'] = info['ext']
2503 except IndexError:
2504 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2505 return
2506
2507 return [info]
2508
2509
2510 class XVideosIE(InfoExtractor):
2511 """Information extractor for xvideos.com"""
2512
2513 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2514 IE_NAME = u'xvideos'
2515
2516 def report_webpage(self, video_id):
2517 """Report information extraction."""
2518 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2519
2520 def report_extraction(self, video_id):
2521 """Report information extraction."""
2522 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2523
2524 def _real_extract(self, url):
2525 mobj = re.match(self._VALID_URL, url)
2526 if mobj is None:
2527 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2528 return
2529 video_id = mobj.group(1).decode('utf-8')
2530
2531 self.report_webpage(video_id)
2532
2533 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2534 try:
2535 webpage = urllib2.urlopen(request).read()
2536 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2537 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2538 return
2539
2540 self.report_extraction(video_id)
2541
2542
2543 # Extract video URL
2544 mobj = re.search(r'flv_url=(.+?)&', webpage)
2545 if mobj is None:
2546 self._downloader.trouble(u'ERROR: unable to extract video url')
2547 return
2548 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2549
2550
2551 # Extract title
2552 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2553 if mobj is None:
2554 self._downloader.trouble(u'ERROR: unable to extract video title')
2555 return
2556 video_title = mobj.group(1).decode('utf-8')
2557
2558
2559 # Extract video thumbnail
2560 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2561 if mobj is None:
2562 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2563 return
2564 video_thumbnail = mobj.group(0).decode('utf-8')
2565
2566 info = {
2567 'id': video_id,
2568 'url': video_url,
2569 'uploader': None,
2570 'upload_date': None,
2571 'title': video_title,
2572 'ext': 'flv',
2573 'format': 'flv',
2574 'thumbnail': video_thumbnail,
2575 'description': None,
2576 'player_url': None,
2577 }
2578
2579 return [info]
2580
2581
2582 class SoundcloudIE(InfoExtractor):
2583 """Information extractor for soundcloud.com
2584 To access the media, the uid of the song and a stream token
2585 must be extracted from the page source and the script must make
2586 a request to media.soundcloud.com/crossdomain.xml. Then
2587 the media can be grabbed by requesting from an url composed
2588 of the stream token and uid
2589 """
2590
2591 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2592 IE_NAME = u'soundcloud'
2593
2594 def __init__(self, downloader=None):
2595 InfoExtractor.__init__(self, downloader)
2596
2597 def report_webpage(self, video_id):
2598 """Report information extraction."""
2599 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2600
2601 def report_extraction(self, video_id):
2602 """Report information extraction."""
2603 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2604
2605 def _real_extract(self, url):
2606 mobj = re.match(self._VALID_URL, url)
2607 if mobj is None:
2608 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2609 return
2610
2611 # extract uploader (which is in the url)
2612 uploader = mobj.group(1).decode('utf-8')
2613 # extract simple title (uploader + slug of song title)
2614 slug_title = mobj.group(2).decode('utf-8')
2615 simple_title = uploader + u'-' + slug_title
2616
2617 self.report_webpage('%s/%s' % (uploader, slug_title))
2618
2619 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2620 try:
2621 webpage = urllib2.urlopen(request).read()
2622 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2623 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2624 return
2625
2626 self.report_extraction('%s/%s' % (uploader, slug_title))
2627
2628 # extract uid and stream token that soundcloud hands out for access
2629 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2630 if mobj:
2631 video_id = mobj.group(1)
2632 stream_token = mobj.group(2)
2633
2634 # extract unsimplified title
2635 mobj = re.search('"title":"(.*?)",', webpage)
2636 if mobj:
2637 title = mobj.group(1).decode('utf-8')
2638 else:
2639 title = simple_title
2640
2641 # construct media url (with uid/token)
2642 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2643 mediaURL = mediaURL % (video_id, stream_token)
2644
2645 # description
2646 description = u'No description available'
2647 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2648 if mobj:
2649 description = mobj.group(1)
2650
2651 # upload date
2652 upload_date = None
2653 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2654 if mobj:
2655 try:
2656 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2657 except Exception, e:
2658 self._downloader.to_stderr(str(e))
2659
2660 # for soundcloud, a request to a cross domain is required for cookies
2661 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2662
2663 return [{
2664 'id': video_id.decode('utf-8'),
2665 'url': mediaURL,
2666 'uploader': uploader.decode('utf-8'),
2667 'upload_date': upload_date,
2668 'title': title,
2669 'ext': u'mp3',
2670 'format': u'NA',
2671 'player_url': None,
2672 'description': description.decode('utf-8')
2673 }]
2674
2675
2676 class InfoQIE(InfoExtractor):
2677 """Information extractor for infoq.com"""
2678
2679 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2680 IE_NAME = u'infoq'
2681
2682 def report_webpage(self, video_id):
2683 """Report information extraction."""
2684 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2685
2686 def report_extraction(self, video_id):
2687 """Report information extraction."""
2688 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2689
2690 def _real_extract(self, url):
2691 mobj = re.match(self._VALID_URL, url)
2692 if mobj is None:
2693 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2694 return
2695
2696 self.report_webpage(url)
2697
2698 request = urllib2.Request(url)
2699 try:
2700 webpage = urllib2.urlopen(request).read()
2701 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2702 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2703 return
2704
2705 self.report_extraction(url)
2706
2707
2708 # Extract video URL
2709 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2710 if mobj is None:
2711 self._downloader.trouble(u'ERROR: unable to extract video url')
2712 return
2713 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2714
2715
2716 # Extract title
2717 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2718 if mobj is None:
2719 self._downloader.trouble(u'ERROR: unable to extract video title')
2720 return
2721 video_title = mobj.group(1).decode('utf-8')
2722
2723 # Extract description
2724 video_description = u'No description available.'
2725 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2726 if mobj is not None:
2727 video_description = mobj.group(1).decode('utf-8')
2728
2729 video_filename = video_url.split('/')[-1]
2730 video_id, extension = video_filename.split('.')
2731
2732 info = {
2733 'id': video_id,
2734 'url': video_url,
2735 'uploader': None,
2736 'upload_date': None,
2737 'title': video_title,
2738 'ext': extension,
2739 'format': extension, # Extension is always(?) mp4, but seems to be flv
2740 'thumbnail': None,
2741 'description': video_description,
2742 'player_url': None,
2743 }
2744
2745 return [info]
2746
2747 class MixcloudIE(InfoExtractor):
2748 """Information extractor for www.mixcloud.com"""
2749 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2750 IE_NAME = u'mixcloud'
2751
2752 def __init__(self, downloader=None):
2753 InfoExtractor.__init__(self, downloader)
2754
2755 def report_download_json(self, file_id):
2756 """Report JSON download."""
2757 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2758
2759 def report_extraction(self, file_id):
2760 """Report information extraction."""
2761 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2762
2763 def get_urls(self, jsonData, fmt, bitrate='best'):
2764 """Get urls from 'audio_formats' section in json"""
2765 file_url = None
2766 try:
2767 bitrate_list = jsonData[fmt]
2768 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2769 bitrate = max(bitrate_list) # select highest
2770
2771 url_list = jsonData[fmt][bitrate]
2772 except TypeError: # we have no bitrate info.
2773 url_list = jsonData[fmt]
2774 return url_list
2775
2776 def check_urls(self, url_list):
2777 """Returns 1st active url from list"""
2778 for url in url_list:
2779 try:
2780 urllib2.urlopen(url)
2781 return url
2782 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2783 url = None
2784
2785 return None
2786
2787 def _print_formats(self, formats):
2788 print 'Available formats:'
2789 for fmt in formats.keys():
2790 for b in formats[fmt]:
2791 try:
2792 ext = formats[fmt][b][0]
2793 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2794 except TypeError: # we have no bitrate info
2795 ext = formats[fmt][0]
2796 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2797 break
2798
2799 def _real_extract(self, url):
2800 mobj = re.match(self._VALID_URL, url)
2801 if mobj is None:
2802 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2803 return
2804 # extract uploader & filename from url
2805 uploader = mobj.group(1).decode('utf-8')
2806 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2807
2808 # construct API request
2809 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2810 # retrieve .json file with links to files
2811 request = urllib2.Request(file_url)
2812 try:
2813 self.report_download_json(file_url)
2814 jsonData = urllib2.urlopen(request).read()
2815 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2816 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2817 return
2818
2819 # parse JSON
2820 json_data = json.loads(jsonData)
2821 player_url = json_data['player_swf_url']
2822 formats = dict(json_data['audio_formats'])
2823
2824 req_format = self._downloader.params.get('format', None)
2825 bitrate = None
2826
2827 if self._downloader.params.get('listformats', None):
2828 self._print_formats(formats)
2829 return
2830
2831 if req_format is None or req_format == 'best':
2832 for format_param in formats.keys():
2833 url_list = self.get_urls(formats, format_param)
2834 # check urls
2835 file_url = self.check_urls(url_list)
2836 if file_url is not None:
2837 break # got it!
2838 else:
2839 if req_format not in formats.keys():
2840 self._downloader.trouble(u'ERROR: format is not available')
2841 return
2842
2843 url_list = self.get_urls(formats, req_format)
2844 file_url = self.check_urls(url_list)
2845 format_param = req_format
2846
2847 return [{
2848 'id': file_id.decode('utf-8'),
2849 'url': file_url.decode('utf-8'),
2850 'uploader': uploader.decode('utf-8'),
2851 'upload_date': u'NA',
2852 'title': json_data['name'],
2853 'ext': file_url.split('.')[-1].decode('utf-8'),
2854 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2855 'thumbnail': json_data['thumbnail_url'],
2856 'description': json_data['description'],
2857 'player_url': player_url.decode('utf-8'),
2858 }]
2859
2860 class StanfordOpenClassroomIE(InfoExtractor):
2861 """Information extractor for Stanford's Open ClassRoom"""
2862
2863 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2864 IE_NAME = u'stanfordoc'
2865
2866 def report_download_webpage(self, objid):
2867 """Report information extraction."""
2868 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2869
2870 def report_extraction(self, video_id):
2871 """Report information extraction."""
2872 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2873
2874 def _real_extract(self, url):
2875 mobj = re.match(self._VALID_URL, url)
2876 if mobj is None:
2877 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2878 return
2879
2880 if mobj.group('course') and mobj.group('video'): # A specific video
2881 course = mobj.group('course')
2882 video = mobj.group('video')
2883 info = {
2884 'id': course + '_' + video,
2885 }
2886
2887 self.report_extraction(info['id'])
2888 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2889 xmlUrl = baseUrl + video + '.xml'
2890 try:
2891 metaXml = urllib2.urlopen(xmlUrl).read()
2892 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2893 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2894 return
2895 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2896 try:
2897 info['title'] = mdoc.findall('./title')[0].text
2898 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2899 except IndexError:
2900 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2901 return
2902 info['ext'] = info['url'].rpartition('.')[2]
2903 info['format'] = info['ext']
2904 return [info]
2905 elif mobj.group('course'): # A course page
2906 course = mobj.group('course')
2907 info = {
2908 'id': course,
2909 'type': 'playlist',
2910 }
2911
2912 self.report_download_webpage(info['id'])
2913 try:
2914 coursepage = urllib2.urlopen(url).read()
2915 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2916 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2917 return
2918
2919 m = re.search('<h1>([^<]+)</h1>', coursepage)
2920 if m:
2921 info['title'] = unescapeHTML(m.group(1))
2922 else:
2923 info['title'] = info['id']
2924
2925 m = re.search('<description>([^<]+)</description>', coursepage)
2926 if m:
2927 info['description'] = unescapeHTML(m.group(1))
2928
2929 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2930 info['list'] = [
2931 {
2932 'type': 'reference',
2933 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2934 }
2935 for vpage in links]
2936 results = []
2937 for entry in info['list']:
2938 assert entry['type'] == 'reference'
2939 results += self.extract(entry['url'])
2940 return results
2941
2942 else: # Root page
2943 info = {
2944 'id': 'Stanford OpenClassroom',
2945 'type': 'playlist',
2946 }
2947
2948 self.report_download_webpage(info['id'])
2949 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2950 try:
2951 rootpage = urllib2.urlopen(rootURL).read()
2952 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2953 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2954 return
2955
2956 info['title'] = info['id']
2957
2958 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2959 info['list'] = [
2960 {
2961 'type': 'reference',
2962 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2963 }
2964 for cpage in links]
2965
2966 results = []
2967 for entry in info['list']:
2968 assert entry['type'] == 'reference'
2969 results += self.extract(entry['url'])
2970 return results
2971
2972 class MTVIE(InfoExtractor):
2973 """Information extractor for MTV.com"""
2974
2975 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2976 IE_NAME = u'mtv'
2977
2978 def report_webpage(self, video_id):
2979 """Report information extraction."""
2980 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2981
2982 def report_extraction(self, video_id):
2983 """Report information extraction."""
2984 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2985
2986 def _real_extract(self, url):
2987 mobj = re.match(self._VALID_URL, url)
2988 if mobj is None:
2989 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2990 return
2991 if not mobj.group('proto'):
2992 url = 'http://' + url
2993 video_id = mobj.group('videoid')
2994 self.report_webpage(video_id)
2995
2996 request = urllib2.Request(url)
2997 try:
2998 webpage = urllib2.urlopen(request).read()
2999 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3000 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3001 return
3002
3003 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3004 if mobj is None:
3005 self._downloader.trouble(u'ERROR: unable to extract song name')
3006 return
3007 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3008 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3009 if mobj is None:
3010 self._downloader.trouble(u'ERROR: unable to extract performer')
3011 return
3012 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3013 video_title = performer + ' - ' + song_name
3014
3015 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3016 if mobj is None:
3017 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3018 return
3019 mtvn_uri = mobj.group(1)
3020
3021 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3022 if mobj is None:
3023 self._downloader.trouble(u'ERROR: unable to extract content id')
3024 return
3025 content_id = mobj.group(1)
3026
3027 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3028 self.report_extraction(video_id)
3029 request = urllib2.Request(videogen_url)
3030 try:
3031 metadataXml = urllib2.urlopen(request).read()
3032 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3033 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3034 return
3035
3036 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3037 renditions = mdoc.findall('.//rendition')
3038
3039 # For now, always pick the highest quality.
3040 rendition = renditions[-1]
3041
3042 try:
3043 _,_,ext = rendition.attrib['type'].partition('/')
3044 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3045 video_url = rendition.find('./src').text
3046 except KeyError:
3047 self._downloader.trouble('Invalid rendition field.')
3048 return
3049
3050 info = {
3051 'id': video_id,
3052 'url': video_url,
3053 'uploader': performer,
3054 'title': video_title,
3055 'ext': ext,
3056 'format': format,
3057 }
3058
3059 return [info]
3060
3061
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Build a pseudo-random session id from the current time."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000, 1998)
        random2 = random.randint(1000, 9999)

        return "%d%d%d" % (nowTime, random1, random2)

    def _get_file_ID_mix_string(self, seed):
        """Shuffle the id alphabet with the site's seeded PRNG; returns a list."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # linear congruential step matching the player's scrambler
            seed = (seed * 211 + 30031) % 65536
            index = math.floor(seed / 65536 * len(source))
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated index string into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Build per-segment download URLs for a Youku video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = urllib2.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            seg_number = len(config['data'][0]['segs'][format])

            keys = []
            for i in xrange(seg_number):
                keys.append(config['data'][0]['segs'][format][i]['k'])

        # TODO check error
        # youku only could be viewed from mainland china
        except Exception:
            # was a bare "except:", which also swallowed KeyboardInterrupt
            # and SystemExit
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # column 8,9 of fileid represent the segment number
        # fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'title': video_title,
                'ext': ext,
                'format': u'NA'
            }
            files_info.append(info)

        return files_info
3182
3183
3184 class XNXXIE(InfoExtractor):
3185 """Information extractor for xnxx.com"""
3186
3187 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3188 IE_NAME = u'xnxx'
3189 VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3190 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3191 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3192
3193 def report_webpage(self, video_id):
3194 """Report information extraction"""
3195 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3196
3197 def report_extraction(self, video_id):
3198 """Report information extraction"""
3199 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3200
3201 def _real_extract(self, url):
3202 mobj = re.match(self._VALID_URL, url)
3203 if mobj is None:
3204 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3205 return
3206 video_id = mobj.group(1).decode('utf-8')
3207
3208 self.report_webpage(video_id)
3209
3210 # Get webpage content
3211 try:
3212 webpage = urllib2.urlopen(url).read()
3213 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3214 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3215 return
3216
3217 result = re.search(self.VIDEO_URL_RE, webpage)
3218 if result is None:
3219 self._downloader.trouble(u'ERROR: unable to extract video url')
3220 return
3221 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3222
3223 result = re.search(self.VIDEO_TITLE_RE, webpage)
3224 if result is None:
3225 self._downloader.trouble(u'ERROR: unable to extract video title')
3226 return
3227 video_title = result.group(1).decode('utf-8')
3228
3229 result = re.search(self.VIDEO_THUMB_RE, webpage)
3230 if result is None:
3231 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3232 return
3233 video_thumbnail = result.group(1).decode('utf-8')
3234
3235 info = {'id': video_id,
3236 'url': video_url,
3237 'uploader': None,
3238 'upload_date': None,
3239 'title': video_title,
3240 'ext': 'flv',
3241 'format': 'flv',
3242 'thumbnail': video_thumbnail,
3243 'description': None,
3244 'player_url': None}
3245
3246 return [info]
3247
3248
3249 class GooglePlusIE(InfoExtractor):
3250 """Information extractor for plus.google.com."""
3251
3252 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3253 IE_NAME = u'plus.google'
3254
3255 def __init__(self, downloader=None):
3256 InfoExtractor.__init__(self, downloader)
3257
3258 def report_extract_entry(self, url):
3259 """Report downloading extry"""
3260 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3261
3262 def report_date(self, upload_date):
3263 """Report downloading extry"""
3264 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3265
3266 def report_uploader(self, uploader):
3267 """Report downloading extry"""
3268 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3269
3270 def report_title(self, video_title):
3271 """Report downloading extry"""
3272 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3273
3274 def report_extract_vid_page(self, video_page):
3275 """Report information extraction."""
3276 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3277
3278 def _real_extract(self, url):
3279 # Extract id from URL
3280 mobj = re.match(self._VALID_URL, url)
3281 if mobj is None:
3282 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3283 return
3284
3285 post_url = mobj.group(0)
3286 video_id = mobj.group(2)
3287
3288 video_extension = 'flv'
3289
3290 # Step 1, Retrieve post webpage to extract further information
3291 self.report_extract_entry(post_url)
3292 request = urllib2.Request(post_url)
3293 try:
3294 webpage = urllib2.urlopen(request).read()
3295 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3296 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % str(err))
3297 return
3298
3299 # Extract update date
3300 upload_date = u'NA'
3301 pattern = 'title="Timestamp">(.*?)</a>'
3302 mobj = re.search(pattern, webpage)
3303 if mobj:
3304 upload_date = mobj.group(1)
3305 # Convert timestring to a format suitable for filename
3306 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3307 upload_date = upload_date.strftime('%Y%m%d')
3308 self.report_date(upload_date)
3309
3310 # Extract uploader
3311 uploader = u'NA'
3312 pattern = r'rel\="author".*?>(.*?)</a>'
3313 mobj = re.search(pattern, webpage)
3314 if mobj:
3315 uploader = mobj.group(1)
3316 self.report_uploader(uploader)
3317
3318 # Extract title
3319 # Get the first line for title
3320 video_title = u'NA'
3321 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3322 mobj = re.search(pattern, webpage)
3323 if mobj:
3324 video_title = mobj.group(1)
3325 self.report_title(video_title)
3326
3327 # Step 2, Stimulate clicking the image box to launch video
3328 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3329 mobj = re.search(pattern, webpage)
3330 if mobj is None:
3331 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3332
3333 video_page = mobj.group(1)
3334 request = urllib2.Request(video_page)
3335 try:
3336 webpage = urllib2.urlopen(request).read()
3337 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3338 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3339 return
3340 self.report_extract_vid_page(video_page)
3341
3342
3343 # Extract video links on video page
3344 """Extract video links of all sizes"""
3345 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3346 mobj = re.findall(pattern, webpage)
3347 if len(mobj) == 0:
3348 self._downloader.trouble(u'ERROR: unable to extract video links')
3349
3350 # Sort in resolution
3351 links = sorted(mobj)
3352
3353 # Choose the lowest of the sort, i.e. highest resolution
3354 video_url = links[-1]
3355 # Only get the url. The resolution part in the tuple has no use anymore
3356 video_url = video_url[-1]
3357 # Treat escaped \u0026 style hex
3358 video_url = unicode(video_url, "unicode_escape")
3359
3360
3361 return [{
3362 'id': video_id.decode('utf-8'),
3363 'url': video_url,
3364 'uploader': uploader.decode('utf-8'),
3365 'upload_date': upload_date.decode('utf-8'),
3366 'title': video_title.decode('utf-8'),
3367 'ext': video_extension.decode('utf-8'),
3368 'format': u'NA',
3369 'player_url': None,
3370 }]