]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
Remove useless headers
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.dailymotion import DailymotionIE
24 from .extractor.gametrailers import GametrailersIE
25 from .extractor.generic import GenericIE
26 from .extractor.metacafe import MetacafeIE
27 from .extractor.statigram import StatigramIE
28 from .extractor.photobucket import PhotobucketIE
29 from .extractor.vimeo import VimeoIE
30 from .extractor.yahoo import YahooIE
31 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
32 from .extractor.zdf import ZDFIE
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the Yahoo video search JSON endpoint (30 results
        per page, via the b= offset parameter) until n entries have been
        collected or the service reports the last page.
        """
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # Skip results whose markup we cannot parse instead of
                    # crashing on mobj.group() (the original raised
                    # AttributeError here when the regex did not match).
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Stop when enough entries were collected or the service
            # indicates this was the last page.
            if (pagenum * 30 + i >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
81
82
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            # Fail with a clear message instead of AttributeError on None
            # (the original dereferenced mobj.group(1) unconditionally).
            raise ExtractorError(u'Unable to extract user id')
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Unescape first so the duplicate check compares the same
                # value that is stored (the original checked the raw match
                # but appended the unescaped one, letting duplicates slip
                # through whenever an id contained HTML entities).
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
141
142
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Decode the response so the str regexes below work: on
            # Python 3 urlopen().read() returns bytes, and mixing str
            # patterns with bytes data raises TypeError.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        # file_id/file_url/file_extension are already text at this point;
        # the original called str.decode('utf-8') on them, which raises
        # AttributeError on Python 3.
        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
187
188
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in using --username/--password or .netrc credentials, if any.

        Login failures only emit warnings; extraction proceeds
        unauthenticated in that case.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials available; proceed without logging in.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
        }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # Decode the response so the str regex below works on
            # Python 3 (urlopen().read() returns bytes).
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded as JSON between these two
        # JavaScript fragments in the page source.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream; fall back to SD. Use .get() for both so a
        # missing 'sd_src' reaches the explicit error below instead of
        # raising a bare KeyError (as the original did, which made the
        # 'Cannot find video URL' branch unreachable).
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data.get('sd_src')
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
283
284
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    # Extracts the file extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Rewrite api.swf fragment URLs into /play/ URLs.
        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect; the real file id lives in the URL
            # fragment of the redirect target. Re-enter extraction with
            # the canonical /a/a-<id> URL.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Ask the page for its JSON representation.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data depending on the User-Agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): title is already str on Python 3, where
                # .decode raises AttributeError — this path looks py2-only;
                # confirm before relying on it.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh was opened in the try block above; if the server did
                # not answer with a video/* Content-Type, the body is JSON.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp is e.g. '05-31-13 08:15PM'; normalize to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    # The same UA must be used when downloading the media.
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
382
383
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self, data, key):
        """RC4-decrypt *data* (bytes) with *key* (bytes); returns a str."""
        x = 0
        box = list(range(256))
        # Key-scheduling algorithm (KSA).
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        # Pseudo-random generation algorithm (PRGA), XOR'd over the data.
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self, s):
        """Return the hex MD5 digest of *s* (bytes), encoded back to bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Doubly base64-encoded site key used to derive the RC4 key below.
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy path: a plain <source> element means no decryption is needed.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

            return [{
                'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': u'flv',
            }]

        # try encxml: the player config is served as RC4-encrypted data.
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # RC4 key = md5(b64decode(b64decode(GK)) + md5(video_id)).
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            # The original referenced an undefined name 'video_filepath'
            # here (NameError on every f4m video); recover the path from
            # the decrypted data like the other fields above.
            video_filepath = self._search_regex('filepath=\'(.*?)\'', dec_data, u'filepath')
            video_filepath = compat_urllib_parse.unquote(video_filepath)
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'tc_url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
            'play_path': video_playpath,
            'video_file': video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url': video_swfobj,
        }]
532
533
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                           (?P<clip>
                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                      $"""

    # Known bitrate ids, lowest to highest quality.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate id (all mp4 currently).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display dimensions per bitrate id (used by --list-formats output).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL requires the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        # Pretty-print available format ids with extension and dimensions.
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Expand :tds / :colbert style abbreviations to the show's
        # full-episodes page and re-match.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare full-episodes URL means "download the newest episode";
            # the site redirects it to the concrete episode URL.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # Follow the redirect and re-parse to learn the episode title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # mgid URIs for the player are embedded either in a <param> tag or
        # in a JS 'var url = "..."' assignment.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # The mgid URI identifies the episode; the MRSS feed lists its parts.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # Each <item> is one part of the episode; fetch its media config
        # to learn the available renditions.
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, rtmp-url) pairs for every rendition.
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                # NOTE(review): this returns None instead of the accumulated
                # results list; callers that iterate the return value may
                # break when --list-formats is used. Present in original.
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp URL into the equivalent progressive-HTTP URL.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
700
701
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # The field name passed here only labels error messages; the
        # original mislabelled this title lookup as u'player url'.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        # The player URL carries the config file location in its query string.
        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Entry 0 is an ad/intro; entry 1 is the actual video.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
761
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report that the XML manifest is being downloaded."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        # First fetch: the moogaloop metadata XML (title, description,
        # thumbnail and the manifest location).
        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # Second fetch: the f4m manifest, which names the media segment.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # The original message wrongly said 'video info XML' here even
            # though this request downloads the manifest.
            raise ExtractorError(u'Unable to download video manifest XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            # (dropped the unused 'as err' binding of the original)
            raise ExtractorError(u'Invalid manifest file')

        # Build the direct fragment URL from the manifest pieces.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
823
824
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The flash URL is percent-encoded inside the page's flashvars.
        flv_url = self._search_regex(r'flv_url=(.+?)&', webpage, u'video URL')
        decoded_url = compat_urllib_parse.unquote(flv_url)

        # Page title, stripped of the trailing " - XVIDEOS..." suffix.
        page_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
            webpage, u'title')

        # Thumbnail is optional; extraction proceeds without it.
        thumb = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': decoded_url,
            'uploader': None,
            'upload_date': None,
            'title': page_title,
            'ext': 'flv',
            'thumbnail': thumb,
            'description': None,
        }]
865
866
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract slug of song title (also in the url); together they
        # form the full title used for progress reporting.
        # (The original also built an unused 'simple_title' local here.)
        slug_title = mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the page URL into the track's API metadata.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Fetch the stream definitions for the track id; we use the
        # 128kbps MP3 HTTP stream.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
923
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract slug of the set title (also in the url); together they
        # form the full title used for progress reporting.
        # (The original also built an unused 'simple_title' local here.)
        slug_title = mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the page URL into the set's API metadata.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        # One entry per track in the set; each needs its own stream lookup.
        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id': video_id,
                'url': mediaURL,
                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
986
987
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the RTMP stream URL and metadata from an InfoQ page.

        The real media id is base64-encoded in the page's `jsclassref`
        variable; the rtmpe URL is built from it.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL (base64-encoded in the page source)
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        video_filename = video_url.split('/')[-1]
        # BUGFIX: use rsplit with maxsplit=1 so a filename containing extra
        # dots (e.g. 'talk.v2.mp4') still unpacks into (id, extension)
        # instead of raising ValueError as plain split('.') did.
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
1030
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        If the format entry maps bitrates to url lists, pick the requested
        bitrate (falling back to the highest available); otherwise the
        entry is the url list itself.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """List every available format/bitrate pair on stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # BUGFIX: the old code called str.decode('utf-8') on these values
        # (and on several strings below), which only exists on Python 2 and
        # raises AttributeError on Python 3; regex groups are already text.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # BUGFIX: initialise so an empty formats dict cannot leave
        # format_param unbound at the return statement below.
        format_param = None
        if req_format is None or req_format == 'best':
            # Probe each format until one of its urls is reachable.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
1135
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        # Three URL shapes map to three behaviours:
        #   course+video -> a single lecture video,
        #   course only  -> a playlist of that course's lectures,
        #   neither      -> the site root, a playlist of all courses.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Title and file name come from a per-video XML descriptor.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Each lecture link becomes a reference entry that is resolved
            # recursively through self.extract below.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Every course page on the root becomes a reference entry,
            # recursively expanded to its lectures via self.extract.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
1231
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract the media URL for an MTV video page.

        Reads the mtvn <meta> tags from the page, then fetches the
        mediaGen XML that lists the available renditions and picks the
        highest-quality one.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')

        # BUGFIX: `performer` was referenced in the info dict below without
        # ever being assigned, so every extraction died with a NameError.
        # The mtv_an tag holds the artist name, so reuse that value.
        performer = video_title

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
1292
1293
class YoukuIE(InfoExtractor):
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-unique session id from the current time (ms) and two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the character table used to decode obfuscated file ids.

        A linear-congruential generator seeded with the server-provided
        `seed` repeatedly draws characters (without replacement) from a
        fixed alphabet; the resulting permutation is the decode table.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated obfuscated file id via the mix table for `seed`."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # 'best' prefers hd2 when available; any other requested format
            # string (even an explicit one) falls back to flv.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            # One download URL (and one info entry) per video segment.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
1386
1387
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Pull the flv url, title and thumbnail out of the video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv url is percent-encoded inside the player parameters.
        video_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL'))

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
            webpage, u'title')

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
1426
1427
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        # The post URL and the short video id both come from the URL itself.
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = match.group(0)
        video_id = match.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        entry_page = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Timestamp of the post, reformatted for use in the filename.
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            entry_page, u'upload date', fatal=False)
        if upload_date:
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d").strftime('%Y%m%d')

        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            entry_page, u'uploader', fatal=False)

        # The first line of the post description doubles as the title.
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            entry_page, 'title', default=u'NA')

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            entry_page, u'video page URL')
        video_webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Collect (resolution, url) pairs for every available size.
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        found = re.findall(pattern, video_webpage)
        if not found:
            raise ExtractorError(u'Unable to extract video links')

        # Highest resolution wins; keep only the url half of the pair.
        video_url = max(found)[-1]

        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
1501
class NBAIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Build the CDN mp4 URL from the page path and scrape the metadata."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        # The media URL can be derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        return [{
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }]
1535
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Maximum number of clips the archive API returns per page.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one archive API page; convert each clip that has a
        usable video_file_url into an info dict.

        Returns (items_on_page, info_dicts); the raw count lets the caller
        detect the final (short) page.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # On failure the API answers with an error object, not a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with an ISO date; keep just YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through its archive listing below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # Single chapter: resolve it to its parent archive file.
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: pick the archive entry matching the page's id;
            # the else branch fires only when no entry matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single archived broadcast (one page, no paging).
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page marks the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
1668
class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Scrape the mp4 source, title and description from a video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = match.group('id')
        webpage = self._download_webpage(url, video_id)

        # The mp4 source is the second <source> tag of the <video> element.
        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player-page heading, fall back to the document title.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }]
1697
class SteamIE(InfoExtractor):
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose form, so the default matcher
        # cannot be used; match explicitly with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a playlist of every trailer on a game's video page."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # An age gate may be interposed; confirm it and refetch the page.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
            webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        videos = []
        # Movie entries, display names and thumbnails appear in the same
        # order on the page, so walk the three match iterators in lockstep.
        for vid, vtitle, thumb in zip(re.finditer(urlRE, webpage),
                                      re.finditer(namesRE, webpage),
                                      re.finditer(thumbsRE, webpage)):
            video_id = vid.group('videoID')
            video_url = vid.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(vtitle.group('videoName')),
                'thumbnail': thumb.group('thumbnail')
            })
        return [self.playlist_result(videos, gameID, game_title)]
1752
class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Derive the CDN flv URL from the recorded-video id and scrape metadata."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The flv lives at a CDN URL built directly from the id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        # NOTE: unlike most extractors in this file, this one returns a
        # bare info dict (not a one-element list); preserved as-is.
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
1784
class WorldStarHipHopIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Scrape the player file URL, title and thumbnail from a video page."""
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # Anything that is not an mp4 link is served as flv.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                video_title = candy_match.group(1)

        return [{
            'id': video_id,
            'url' : video_url,
            'title' : video_title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
1824
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Read the show metadata embedded as JSON in the page's window.gon assignment."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
1858
1859
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # Return the first entry whose 'format' equals req_format, else None.
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site gates content behind an age check; this cookie bypasses it.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # Narrowed from a bare `except:` so unrelated errors propagate.
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as e:
            # Fixed: old code concatenated str + exception object, which
            # raised TypeError instead of the intended ExtractorError.
            raise ExtractorError(u'Missing JSON parameter: %s' % e)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Fixed: previously tested the undefined name `result`, which
            # raised NameError whenever a specific format was requested.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
1964
1965
1966
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flv URL sits (percent-encoded) in the player setup code.
        video_url = compat_urllib_parse.unquote(self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url'))

        # Upload date is optional; normalize to YYYYMMDD when present.
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{'id': video_id,
                 'url': video_url,
                 'uploader': None,
                 'upload_date': upload_date,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv'}]
2001
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<title>(?P<title>.*)</title>', webpage, u'title').strip()

        # The actual media URL lives on a separate embed page.
        embed_match = re.search(
            r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
2042
class EightTracksIE(InfoExtractor):
    """Extracts every track of an 8tracks.com mix as separate entries."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API wants a (random) session token plus the mix id.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        i = 0
        # Tracks are handed out one at a time; follow the chain until the
        # API flags the last track.
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
            i += 1
        return entries
2083
class KeekIE(InfoExtractor):
    """Extractor for keek.com; media URLs are derived from the video id."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Both CDN URLs are fully determined by the video id alone.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
2111
class TEDIE(InfoExtractor):
    # Matches both single talks and playlists; compiled with re.VERBOSE.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL type: single talk vs. playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): the two finditer() streams below are paired with
        # zip(), so both regexes must yield matches in document order.
        video_RE=r'''
                 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                 ([.\s]*?)data-playlist_item_id="(\d+)"
                 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                 '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Each talk is delegated back to this extractor via url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                       webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # NOTE(review): `info` (the parsed JSON blob) is rebound here to the
        # result dict; the last entry of htmlStreams is the one returned.
        info = {
                'id': info['id'],
                'url': info['htmlStreams'][-1]['file'],
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'description': desc,
                }
        return info
2186
class MySpassIE(InfoExtractor):
    """Extractor for myspass.de, driven by the site's XML metadata endpoint."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Fixed: the fallback referenced the undefined name `ext`
            # (NameError); fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
2240
class SpiegelIE(InfoExtractor):
    """Extracts spiegel.de videos via the per-video flash XML descriptor."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', webpage, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last element of the XML document is the one used.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        return [{
            'id': video_id,
            'url': video_url,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
2272
class LiveLeakIE(InfoExtractor):
    """Extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Drop the site-name prefix LiveLeak puts into og:title.
        raw_title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        video_title = raw_title.replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(
            r'By:.*?(\w+)</a>', webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }]
2309
2310
2311
class TumblrIE(InfoExtractor):
    """Extracts videos embedded in Tumblr posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        blog = mobj.group('blog_name')

        # Always fetch the canonical post URL, whatever form we were given.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is JS-escaped; \x22 stands for a double quote.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': ext
        }]
2345
class BandcampIE(InfoExtractor):
    # NOTE(review): left byte-identical — the free-download flow below is a
    # fixed sequence of requests and regexes that is easy to break.
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Extract a free Bandcamp track by walking the free-download flow."""
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Track id scraped from the embedded TralbumData JS object.
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                       webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        #We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        #in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id':id,
                      'title' : info[u'title'],
                      'ext' : 'mp3',
                      'url' : final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
                      }

        return [track_info]
2391
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The mp4 source is exposed in an HTML5 <source> tag.
        video_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL')
        video_title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
2419
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Metadata (including the mp4 URL) comes from the player MRSS feed.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)
        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
2446
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical page for the video id.
        webpage = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)
        self.report_extraction(video_id)

        video_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
2480
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical https page for the id.
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)
        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
2514
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')

        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # A per-photo "secret" is needed by both playlist endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The playlist XML names the streaming app and the media path.
        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
2563
class TeamcocoIE(InfoExtractor):
    """Extractor for teamcoco.com; resolves a slug to its CVP data feed."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id (needed for the data feed URL) is embedded in the page.
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', webpage, u'video id')
        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data = self._download_webpage('http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
2602
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # NOTE(review): despite the name, this is the plain movie page URL.
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config provides either a full URL or a server+key pair.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # Upload date is taken from a "YYYY-MM-DD hh:mm:ss TZ" tooltip hint.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
2654
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    # NOTE(review): left byte-identical — the cookie captured from the first
    # request must be replayed on the /serve request below, so the request
    # order matters.
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # A timestamp query parameter is appended to the page request.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            # Only the first track of the embedded list is extracted.
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
2704
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page performs a JS redirect; follow it by hand.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # POST to the info endpoint; the reply is "key=value&key=value".
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        final_url, thumbnail_url = (field.split('=')[1] for field in info_response.split('&'))

        return [{
            'id': video_id,
            'url': final_url,
            'ext': 'flv',
            'title': title,
            'thumbnail': thumbnail_url,
        }]
2740
2741
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep this ordered from most to least specific: GenericIE must stay last,
    # and the specialised YouTube extractors must precede YoutubeIE itself.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [ie_class() for ie_class in ie_classes]
2811
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention and live at
    # module level, so a plain globals() lookup resolves them.
    class_name = '%sIE' % ie_name
    return globals()[class_name]