# ]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
# Move GenericIE into its own file
# [yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19 import hashlib
20 import binascii
21 import urllib
22
23 from .utils import *
24 from .extractor.common import InfoExtractor, SearchInfoExtractor
25
26 from .extractor.ard import ARDIE
27 from .extractor.arte import ArteTvIE
28 from .extractor.dailymotion import DailymotionIE
29 from .extractor.gametrailers import GametrailersIE
30 from .extractor.generic import GenericIE
31 from .extractor.metacafe import MetacafeIE
32 from .extractor.statigram import StatigramIE
33 from .extractor.photobucket import PhotobucketIE
34 from .extractor.vimeo import VimeoIE
35 from .extractor.yahoo import YahooIE
36 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
37 from .extractor.zdf import ZDFIE
38
39
40
41
42
43
44
45
46
47
48
49
50
51
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Return a playlist dict containing up to n URL results for query."""

        playlist = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        pagenum = 0
        while True:
            pagenum += 1
            result_url = (u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en'
                          % (compat_urllib_parse.quote_plus(query), pagenum * 10))
            webpage = self._download_webpage(
                result_url, u'gvsearch:' + query,
                note='Downloading result page ' + str(pagenum))

            # Each organic result carries its video URL in an <h3 class="r"> anchor
            playlist['entries'].extend(
                {'_type': 'url', 'url': m.group(1)}
                for m in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage))

            # Stop once enough results were requested or there is no "next" link
            if pagenum * 10 > n or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return playlist
82
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Return a playlist dict with up to n Yahoo! Screen results for query.

        Pages through the JSON search endpoint 30 results at a time until
        either n entries were collected or the API reports the last page.
        """

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                # BUGFIX: skip result entries whose markup contains no video
                # link instead of crashing with AttributeError on mobj.group().
                if mobj is None:
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            if (pagenum * 30 + i >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
116
117
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return a playlist of every video uploaded by a blip.tv user."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        # BUGFIX: fail with a clear message instead of an AttributeError on
        # .group() when the user id is not present in the page.
        if mobj is None:
            raise ExtractorError(u'Unable to extract user id for %s' % username)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # BUGFIX: deduplicate on the unescaped id (the value actually
                # stored), not on the raw HTML-escaped match text.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
176
177
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Resolve a depositfiles.com file page to its direct download URL."""
        file_id = url.split('/')[-1]
        # Rebuild the URL in the English locale so the markup is predictable
        url = 'http://depositfiles.com/en/files/' + file_id

        # Request the page as if the 'Free download' button had been pressed
        post_data = compat_urllib_parse.urlencode({ 'gateway_result' : '1' })
        request = compat_urllib_request.Request(url, post_data)
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Look for the form holding the real file URL
        mobj = re.search('<form action="(http://fileshare.+?)"', webpage)
        if mobj is None or mobj.group(1) is None:
            # No download form: surface the site's own error message if any
            mobj = re.search('<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if mobj is not None and mobj.group(1) is not None:
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # The title lives in the 'title' attribute of a <b> element
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
        }]
222
223
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        # Optional login step: credentials may come from the downloader's
        # username/password params or from a .netrc entry for 'facebook'.
        # Extraction proceeds anonymously when no credentials are available.
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials: skip login entirely
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login <form> still present in the response means the
            # credentials were rejected (we are back on the login page).
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL and metadata from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded as JSON in inline JS, delimited
        # by these two literal code fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is percent-encoded JSON holding the stream descriptors
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream; fall back to SD
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
318
319
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video info, handling player/API URLs and direct downloads."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        # api.swf#<id> URLs are rewritten to the equivalent /play/ URL
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a page whose URL fragment carries the
            # real file reference; follow the redirect, build the canonical
            # /a/a-<file_id> URL and recurse once with it.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        # Ask blip.tv for the JSON representation of the page (no HTML wrap)
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different (simpler) responses to the iTunes UA
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The URL itself points at the media file; derive the title
                # and extension from the final path component.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh is still open from the request above; read the JSON body
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' key or flat
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # e.g. '05-31-13 07:22PM' -> '20130531'
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
417
418
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        # Plain RC4 stream cipher: key-scheduling loop over the 256-entry
        # box, then the pseudo-random generation loop XORs the keystream
        # into data. Returns the decrypted bytes as a str.
        x = 0
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self,s):
        # Hex digest of md5(s), returned as bytes
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        """Extract the video URL, decrypting the player XML when necessary."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Static, double-base64-encoded secret used to derive the RC4 key
        GK = (
            b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
            b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
            b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy case: the page contains a plain <source> element
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

            return [{
                'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': u'flv',
            }]

        # try encxml
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        params = {}
        encxml = ''
        sec = mobj.group(1)
        # flashvars is a JS object literal of 'key':'value' pairs; the
        # special _encxml entry points at the encrypted-XML endpoint, all
        # other pairs are forwarded as query parameters.
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data
        # The response is 'something=<hex>'; take the hex part and unhexlify
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # RC4 key = md5(base64-decode twice of GK + md5(video_id))
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            # RTMP case
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            # NOTE(review): 'video_filepath' is not defined anywhere in this
            # method, so taking this branch raises NameError. Presumably the
            # path extracted from dec_data was intended — needs fixing.
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'tc_url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
            'play_path': video_playpath,
            'video_file': video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url': video_swfobj,
        }]
567
568
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                     (?P<showname>thedailyshow|colbertnation)\.com/
                     (full-episodes/(?P<episode>.*)|
                      (?P<clip>
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrate identifiers (not referenced within this class body)
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate identifier
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolution per bitrate identifier (used for --list-formats)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        # Print a human-readable table of available format ids
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract all parts of an episode (or a single clip) as a result list."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Expand the :tds / :colbert style abbreviations to the
        # corresponding full-episodes page, then re-match.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare show URL means "download the newest full episode"
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # The show page redirects to the newest episode; re-parse the
            # final URL to recover the concrete episode component.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS index feed lists one <item> per part of the episode
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The guid looks like '...:<showId>.com:<mediaId>'
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            # Each <rendition> advertises one (bitrate, rtmp url) variant
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP stream path onto the public HTTP mirror
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
735
736
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract a video from an escapistmagazine.com view page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # BUGFIX: this regex extracts the title, but the field name passed to
        # _html_search_regex (used in error/log messages) wrongly said
        # 'player url' — a copy-paste from the extraction above.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        # The player URL carries the (percent-encoded) config URL as a query arg
        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
796
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Resolve a collegehumor video through its moogaloop XML + f4m manifest."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # First request: page metadata XML
        meta_url = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            meta_xml = compat_urllib_request.urlopen(meta_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        meta_doc = xml.etree.ElementTree.fromstring(meta_xml)
        try:
            video_node = meta_doc.findall('./video')[0]
            info['description'] = video_node.findall('./description')[0].text
            info['title'] = video_node.findall('./caption')[0].text
            info['thumbnail'] = video_node.findall('./thumbnail')[0].text
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # Second request: the Adobe HDS (f4m) manifest
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifest_xml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        manifest_doc = xml.etree.ElementTree.fromstring(manifest_xml)
        try:
            media_node = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid manifest file')

        # Build the fragment URL from the manifest location and ids
        parsed = compat_urllib_parse_urlparse(manifest_url)
        info['url'] = parsed.scheme + '://' + parsed.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
        info['ext'] = 'f4f'
        return [info]
858
859
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract the FLV URL, title and thumbnail from an xvideos page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The media URL is percent-encoded inside the flv_url flashvars entry
        video_url = compat_urllib_parse.unquote(
            self._search_regex(r'flv_url=(.+?)&', webpage, u'video URL'))

        # Title is the <title> tag minus the trailing site suffix
        video_title = self._html_search_regex(
            r'<title>(.*?)\s+-\s+XVID', webpage, u'title')

        video_thumbnail = self._search_regex(
            r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
900
901
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

    Resolves the track page through the public API to obtain the track
    metadata, then fetches the stream definitions to pick the MP3 URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader and the track slug are encoded in the URL
        uploader, slug_title = mobj.group(1), mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the canonical track page into its API representation
        track_url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + track_url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Fetch the per-track stream definitions and take the 128k MP3
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)

        return [{
            'id': info['id'],
            'url': streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
958
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets

    The set page URL is resolved through SoundCloud's public resolve
    endpoint into JSON metadata that lists every track; each track's
    stream definitions are then fetched from api.sndcdn.com and its
    128 kbps MP3 stream is used.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report that the set id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The uploader and the set's title slug are both part of the URL.
        uploader, slug_title = mobj.group(1), mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the set page into its JSON metadata.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info = json.loads(self._download_webpage(resolv_url, full_title))

        # The API reports failures through an 'errors' list; report each
        # one and bail out (returns None, as before).
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        videos = []
        for track in info['tracks']:
            track_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(track_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, track_id, u'Downloading track info JSON')

            self.report_extraction(track_id)
            streams = json.loads(stream_json)

            videos.append({
                'id': track_id,
                'url': streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
1021
1022
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media path is base64-encoded in the page's jsclassref
        # variable; decoding it yields the RTMP path.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Derive id and extension from the final path component of the URL.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
1065
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com

    Queries the (v1) cloudcast JSON API for a show's metadata, then probes
    the advertised audio format URLs and returns the first one that is
    actually reachable.
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        For 'best' (or an unknown bitrate) the highest available bitrate is
        chosen; formats without bitrate sub-dicts are returned as-is.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """Print every advertised format/bitrate with its file extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # BUG FIX: the original called .decode('utf-8') on these str
        # objects, which raises AttributeError on Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe each advertised format until one has a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
1170
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video (course+video), a course
    page (playlist of videos), and the root page (playlist of courses).
    Playlist pages recurse back into this extractor via 'reference'
    entries.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _page_references(self, page, link_re):
        # Build a 'reference' entry for every page link matching link_re;
        # factored out of the course/root branches, which were identical.
        links = orderedSet(re.findall(link_re, page))
        return [
            {
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(page_link),
            }
            for page_link in links]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Raw string: '\?' is an invalid escape sequence in a normal
            # string literal (warning on modern Pythons).
            info['list'] = self._page_references(coursepage, r'<a href="(VideoPage.php\?[^"]+)">')
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            info['list'] = self._page_references(rootpage, r'<a href="(CoursePage.php\?[^"]+)">')
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
1266
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # mtv_vt carries the song name; mtv_an carries the artist name.
        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')

        # BUG FIX: the info dict below referenced an undefined name
        # 'performer' (guaranteed NameError at runtime). mtv_an is the
        # artist name, so reuse that extraction as the uploader.
        performer = video_title

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
1327
1328
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves videos as multiple obfuscated segments: the file id is
    scrambled with a per-video 'seed', and each segment URL carries its
    own key, so extraction reimplements the player's descrambling scheme.
    """
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        # Session id: current time in milliseconds followed by two random
        # components, mimicking the official player's sid format.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Derive the substitution alphabet from the video's seed.

        A linear-congruential sequence seeded with `seed` repeatedly picks
        (and removes) characters from the source alphabet, producing the
        permutation used to decode the scrambled file id.  Statement order
        matters: each pick shrinks `source` before the next iteration.
        """
        mixed = []
        # NOTE(review): '\:' in this literal is an invalid escape sequence
        # (kept as backslash + colon at runtime) — present in the original.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # Each '*'-separated token of the scrambled id is an index into the
        # seed-derived alphabet; concatenating the picks yields the real id.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # The player config JSON carries title, seed, formats and segment keys.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested format onto Youku's stream names:
            # 'hd2' when available for 'best', 'mp4' for 'worst', else 'flv'.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
1421
1422
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Fetch the page, then pull url/title/thumbnail out of it with the
        # class-level patterns above.
        webpage = self._download_webpage(url, video_id)

        flv_url = self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL')
        # The flash variable is percent-encoded in the page source.
        flv_url = compat_urllib_parse.unquote(flv_url)

        page_title = self._html_search_regex(self.VIDEO_TITLE_RE, webpage, u'title')

        thumb_url = self._search_regex(self.VIDEO_THUMB_RE, webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': flv_url,
            'uploader': None,
            'upload_date': None,
            'title': page_title,
            'ext': 'flv',
            'thumbnail': thumb_url,
            'description': None,
        }]
1461
1462
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        # Both the full post URL and the short id come from _VALID_URL.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url, video_id = mobj.group(0), mobj.group(1)

        video_extension = 'flv'

        # Step 1: the post page carries date, author and title.
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Reformat the timestamp for use in a filename.
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # The first line of the description doubles as the title.
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2: follow the link the image box would open to reach the
        # actual video page.
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Collect every (resolution, url) pair advertised on the video page.
        all_links = re.findall('\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"', webpage)
        if len(all_links) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sorting puts the highest resolution last; keep only its URL.
        video_url = sorted(all_links)[-1][-1]

        # Unescape \u0026-style hex sequences.
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
1536
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_path = mobj.group(1)

        page = self._download_webpage(url, video_path)

        # The CDN URL is derived directly from the path in the page URL.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_path + '_nba_1280x720.mp4'

        short_id = video_path.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            page, 'title', default=short_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', page, 'description', fatal=False)

        return [{
            'id': short_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }]
1570
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes: a bare channel name (paged archive listing),
    # /b/<id> for a single broadcast, /c/<id> for a chapter of one.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Page size the Justin.tv API serves per request.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        # Fetch one API page.  Note the returned count is the *raw* response
        # length, not len(info): the caller uses it to decide whether a full
        # page was served and paging should continue.
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # Errors come back as a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Clips without a file URL are skipped (still counted above).
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO-like; keep just YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through its archive listing.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # A chapter is a sub-range of an archived broadcast; find the
            # containing archive and download that whole file.
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: the else only fires if no <archive> matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Chapter title/description/uploader come from the Twitch
            # Kraken API rather than the XML above.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single archived broadcast.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        # Page through the API; a short page (page_count != limit) or the
        # non-paged case terminates the loop.
        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
1703
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = m.group('id')
        page = self._download_webpage(url, video_id)

        # The mp4 source is the second <source> inside the <video> tag.
        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            page, u'video URL', flags=re.DOTALL)

        # Prefer the player headline; fall back to the <title> element.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), page, 'title', flags=re.DOTALL)

        description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            page, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
1732
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com video/app pages."""
    # Verbose regex: matches /video/<gameID> and /app/<gameID> pages,
    # optionally behind an /agecheck/ prefix.
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Age-gated games: re-request through the agecheck URL, which
        # embeds a fixed birth date so the real page is served.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # NOTE(review): the three finditer streams are assumed to yield
        # movies, titles and thumbnails in the same page order — zip pairs
        # them purely positionally; verify against a live page.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            info = {
                'id':video_id,
                'url':video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        # A single playlist entry wrapping every trailer found on the page.
        return [self.playlist_result(videos, gameID, game_title)]
1787
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Recorded videos live at a predictable CDN location.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        page = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            page, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            page, u'thumbnail', fatal=False)

        # NOTE: this extractor returns a bare info dict rather than a
        # single-element list (kept as in the original).
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
1819
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # The flash variable may point at either an mp4 or an flv file.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            mobj = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url' : video_url,
            'title' : video_title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
1859
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Extract an RBMA Radio show from the page's embedded gon.show JSON blob."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Forcing cbr=256 selects the 256 kbit/s stream variant.
        video_url = data['akamai_url'] + '&cbr=256'
        # Extension is taken from the URL path, e.g. ".../show.mp3" -> "mp3".
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
1893
1894
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Bypass the age gate with a pre-set cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Video metadata is passed to a JS Video() constructor as JSON.
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # was a bare `except:`, which also swallowed KeyboardInterrupt etc.
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as e:
            # str(e): concatenating the exception object itself raised TypeError.
            raise ExtractorError('Missing JSON parameter: ' + str(e))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # Path segment 4 looks like '480p_370k_8004515'; its first two
            # '_'-separated parts (size, bitrate) form the format id, e.g. '480p-370k'.
            format = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # BUG FIX: previously checked the undefined name 'result' (NameError).
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
1999
2000
2001
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        # The title is taken straight from the URL slug.
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL.
        # BUG FIX: dots in the host were unescaped ('video[0-9].pornotube.com'),
        # letting '.' match any character; they are now escaped literals.
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9]\.pornotube\.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
2036
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Resolve a youjizz watch page to its flash media URL via the embed page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The watch page only yields the title ...
        webpage = self._download_webpage(url, video_id)
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # ... the media URL lives on a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)
        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
2077
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes; returns one entry per track."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded in the page as a JS assignment: "PAGE.mix = {...};".
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API wants a client-chosen session id; any random number works.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        # The API hands out one track per request; keep hitting the "next"
        # endpoint until it flags the last track of the mix.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            # The next request must reference the track we just received.
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
2118
class KeekIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Extract a Keek clip; media and thumbnail URLs follow fixed CDN patterns."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': title,
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': uploader
        }]
2146
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE whitespace/comments.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on whether the URL points at a single talk or a playlist.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): '([.\s]*?)' matches only literal dots/whitespace --
        # presumably '[\S\s]' (any char) was intended; left as-is to preserve behavior.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Each talk is delegated back to this extractor as a plain talk URL.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                       webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # The last entry of 'htmlStreams' is the one selected for download.
        info = {
            'id': info['id'],
            'url': info['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': desc,
        }
        return info
2221
class MySpassIE(InfoExtractor):
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Extract a myspass.de video via its XML metadata service."""
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: previously assigned the undefined name 'ext' (NameError);
            # fall back to the file extension when no format id is present.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
2275
class SpiegelIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Extract a Spiegel video via its per-video flash XML descriptor."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last variant listed in the descriptor.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
2307
class LiveLeakIE(InfoExtractor):

    # BUG FIX: the scheme was the typo 'http?' (matching 'htt'/'http' but never
    # 'https'), so https links were never recognized; now 'https?'.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract a LiveLeak video from its view page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # og:title is prefixed with the site name; strip it off.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
2344
2345
2346
class TumblrIE(InfoExtractor):
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Extract a video embedded in a Tumblr post."""
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # Pick the first poster frame as the thumbnail; unescape backslashes.
        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
2380
class BandcampIE(InfoExtractor):
    """Information extractor for free Bandcamp tracks (mp3-320 only)."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # The numeric track id lives in the page's TralbumData JS object.
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                       webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some JavaScript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        # (the '.rand' value is a fixed magic number known to be accepted).
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key; with this fixed value it arrives as "retry_url".
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id':id,
                      'title' : info[u'title'],
                      'ext' : 'mp3',
                      'url' : final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
                      }

        return [track_info]
2426
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')
        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        # RedTube serves plain MP4 sources.
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
2454
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Metadata (including the MP4 link) is published as an MRSS feed.
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
2481
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical page for the extracted numeric id.
        webpage = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
2515
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Fetch the canonical page regardless of the URL variant passed in.
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
2549
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')

        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Step 1: the photo page embeds a per-video secret.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        # Step 2: trade id + secret for a playlist node id.
        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        # Step 3: the playlist XML carries the actual stream location.
        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
2598
class TeamcocoIE(InfoExtractor):
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Extract a Team Coco video via its CVP XML data feed."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The slug page exposes the numeric id needed for the data feed.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data = self._download_webpage('http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
2637
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Despite the variable name this is a plain HTML page, not an MRSS feed.
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config carries server + file as JS string literals.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        # An empty 'srv' means 'file' already holds the complete (escaped) URL.
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # Upload date is shown as a 'hint' tooltip: 'YYYY-MM-DD hh:mm:ss TZ'.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Concatenate to the YYYYMMDD form expected downstream.
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
2689
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        # The id from the URL is only provisional; the definitive track id
        # comes from the embedded JSON below.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # 'ax'/'ts' query parameters mimic the site's own AJAX requests.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The session cookie must be replayed on the /serve request below.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # Track metadata is embedded as JSON inside a <script> tag.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # The serve endpoint answers with JSON holding the final stream URL.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
2739
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        # Pull the video id out of the play URL.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page is a stub that redirects via JavaScript; resolve the
        # target manually and fetch the real video page.
        stub_page, urlh = self._download_webpage_handle(url, video_id)
        target = self._search_regex(r'window\.location = \'(.*)\';', stub_page, u'redirect location')
        webpage = self._download_webpage(urlh.geturl() + target, video_id, u'Downloading redirect page')

        # The <title> holds "<name> / <extra>"; keep only the first segment.
        raw_title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title')
        title = raw_title.split('/')[0].strip()

        # Ask the magare.do endpoint for the media URL and thumbnail.
        form_data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request("http://vbox7.com/play/magare.do", form_data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response is "<key>=<url>&<key>=<thumb>"; keep the values only.
        final_url, thumbnail_url = [field.split('=')[1] for field in info_response.split('&')]

        return [{
            'id': video_id,
            'url': final_url,
            'ext': "flv",
            'title': title,
            'thumbnail': thumbnail_url,
        }]
2775
2776
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep the classes in a tuple so the matching priority is explicit,
    # then instantiate them all in that exact order.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [klass() for klass in ie_classes]
2846
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention and live
    # at module level, so a plain globals() lookup resolves them.
    class_name = '%sIE' % ie_name
    return globals()[class_name]