import base64
import datetime
import itertools
import netrc
import os
import re
import socket
import time
import email.utils
import xml.etree.ElementTree
import random
import math
import operator
import hashlib
import binascii
import urllib

from .utils import *
from .extractor.common import InfoExtractor, SearchInfoExtractor

from .extractor.ard import ARDIE
from .extractor.arte import ArteTvIE
from .extractor.bliptv import BlipTVIE, BlipTVUserIE
from .extractor.comedycentral import ComedyCentralIE
from .extractor.dailymotion import DailymotionIE
from .extractor.gametrailers import GametrailersIE
from .extractor.generic import GenericIE
from .extractor.googleplus import GooglePlusIE
from .extractor.googlesearch import GoogleSearchIE
from .extractor.metacafe import MetacafeIE
from .extractor.myvideo import MyVideoIE
from .extractor.statigram import StatigramIE
from .extractor.photobucket import PhotobucketIE
from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
from .extractor.vimeo import VimeoIE
from .extractor.yahoo import YahooIE, YahooSearchIE
from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
from .extractor.zdf import ZDFIE


class DepositFilesIE(InfoExtractor):
59 """Information extractor for depositfiles.com"""
60
61 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
62
63 def _real_extract(self, url):
64 file_id = url.split('/')[-1]
65 # Rebuild url in english locale
66 url = 'http://depositfiles.com/en/files/' + file_id
67
68 # Retrieve file webpage with 'Free download' button pressed
69 free_download_indication = { 'gateway_result' : '1' }
70 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
71 try:
72 self.report_download_webpage(file_id)
73 webpage = compat_urllib_request.urlopen(request).read()
74 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
75 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
76
77 # Search for the real file URL
78 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
79 if (mobj is None) or (mobj.group(1) is None):
80 # Try to figure out reason of the error.
81 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
82 if (mobj is not None) and (mobj.group(1) is not None):
83 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
84 raise ExtractorError(u'%s' % restriction_message)
85 else:
86 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
87
88 file_url = mobj.group(1)
89 file_extension = os.path.splitext(file_url)[1][1:]
90
91 # Search for file title
92 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
93
94 return [{
95 'id': file_id.decode('utf-8'),
96 'url': file_url.decode('utf-8'),
97 'uploader': None,
98 'upload_date': None,
99 'title': file_title,
100 'ext': file_extension.decode('utf-8'),
101 }]
102
103
104
105
106
107
108
109
110
111 class EscapistIE(InfoExtractor):
112 """Information extractor for The Escapist """
113
114 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
115 IE_NAME = u'escapist'
116
117 def _real_extract(self, url):
118 mobj = re.match(self._VALID_URL, url)
119 if mobj is None:
120 raise ExtractorError(u'Invalid URL: %s' % url)
121 showName = mobj.group('showname')
122 videoId = mobj.group('episode')
123
124 self.report_extraction(videoId)
125 webpage = self._download_webpage(url, videoId)
126
127 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
128 webpage, u'description', fatal=False)
129
130 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
131 webpage, u'thumbnail', fatal=False)
132
133 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
134 webpage, u'player url')
135
136 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
137 webpage, u'player url').split(' : ')[-1]
138
139 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
140 configUrl = compat_urllib_parse.unquote(configUrl)
141
142 configJSON = self._download_webpage(configUrl, videoId,
143 u'Downloading configuration',
144 u'unable to download configuration')
145
146 # Technically, it's JavaScript, not JSON
147 configJSON = configJSON.replace("'", '"')
148
149 try:
150 config = json.loads(configJSON)
151 except (ValueError,) as err:
152 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
153
154 playlist = config['playlist']
155 videoUrl = playlist[1]['url']
156
157 info = {
158 'id': videoId,
159 'url': videoUrl,
160 'uploader': showName,
161 'upload_date': None,
162 'title': title,
163 'ext': 'mp4',
164 'thumbnail': imgUrl,
165 'description': videoDesc,
166 'player_url': playerUrl,
167 }
168
169 return [info]
170
171 class CollegeHumorIE(InfoExtractor):
172 """Information extractor for collegehumor.com"""
173
174 _WORKING = False
175 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
176 IE_NAME = u'collegehumor'
177
178 def report_manifest(self, video_id):
179 """Report information extraction."""
180 self.to_screen(u'%s: Downloading XML manifest' % video_id)
181
182 def _real_extract(self, url):
183 mobj = re.match(self._VALID_URL, url)
184 if mobj is None:
185 raise ExtractorError(u'Invalid URL: %s' % url)
186 video_id = mobj.group('videoid')
187
188 info = {
189 'id': video_id,
190 'uploader': None,
191 'upload_date': None,
192 }
193
194 self.report_extraction(video_id)
195 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
196 try:
197 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
198 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
199 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
200
201 mdoc = xml.etree.ElementTree.fromstring(metaXml)
202 try:
203 videoNode = mdoc.findall('./video')[0]
204 info['description'] = videoNode.findall('./description')[0].text
205 info['title'] = videoNode.findall('./caption')[0].text
206 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
207 manifest_url = videoNode.findall('./file')[0].text
208 except IndexError:
209 raise ExtractorError(u'Invalid metadata XML file')
210
211 manifest_url += '?hdcore=2.10.3'
212 self.report_manifest(video_id)
213 try:
214 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
215 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
216 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
217
218 adoc = xml.etree.ElementTree.fromstring(manifestXml)
219 try:
220 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
221 node_id = media_node.attrib['url']
222 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
223 except IndexError as err:
224 raise ExtractorError(u'Invalid manifest file')
225
226 url_pr = compat_urllib_parse_urlparse(manifest_url)
227 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
228
229 info['url'] = url
230 info['ext'] = 'f4f'
231 return [info]
232
233
234 class XVideosIE(InfoExtractor):
235 """Information extractor for xvideos.com"""
236
237 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
238 IE_NAME = u'xvideos'
239
240 def _real_extract(self, url):
241 mobj = re.match(self._VALID_URL, url)
242 if mobj is None:
243 raise ExtractorError(u'Invalid URL: %s' % url)
244 video_id = mobj.group(1)
245
246 webpage = self._download_webpage(url, video_id)
247
248 self.report_extraction(video_id)
249
250 # Extract video URL
251 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
252 webpage, u'video URL'))
253
254 # Extract title
255 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
256 webpage, u'title')
257
258 # Extract video thumbnail
259 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
260 webpage, u'thumbnail', fatal=False)
261
262 info = {
263 'id': video_id,
264 'url': video_url,
265 'uploader': None,
266 'upload_date': None,
267 'title': video_title,
268 'ext': 'flv',
269 'thumbnail': video_thumbnail,
270 'description': None,
271 }
272
273 return [info]
274
275
276
277
278 class InfoQIE(InfoExtractor):
279 """Information extractor for infoq.com"""
280 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
281
282 def _real_extract(self, url):
283 mobj = re.match(self._VALID_URL, url)
284 if mobj is None:
285 raise ExtractorError(u'Invalid URL: %s' % url)
286
287 webpage = self._download_webpage(url, video_id=url)
288 self.report_extraction(url)
289
290 # Extract video URL
291 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
292 if mobj is None:
293 raise ExtractorError(u'Unable to extract video url')
294 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
295 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
296
297 # Extract title
298 video_title = self._search_regex(r'contentTitle = "(.*?)";',
299 webpage, u'title')
300
301 # Extract description
302 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
303 webpage, u'description', fatal=False)
304
305 video_filename = video_url.split('/')[-1]
306 video_id, extension = video_filename.split('.')
307
308 info = {
309 'id': video_id,
310 'url': video_url,
311 'uploader': None,
312 'upload_date': None,
313 'title': video_title,
314 'ext': extension, # Extension is always(?) mp4, but seems to be flv
315 'thumbnail': None,
316 'description': video_description,
317 }
318
319 return [info]
320
321 class MixcloudIE(InfoExtractor):
322 """Information extractor for www.mixcloud.com"""
323
324 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
325 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
326 IE_NAME = u'mixcloud'
327
328 def report_download_json(self, file_id):
329 """Report JSON download."""
330 self.to_screen(u'Downloading json')
331
332 def get_urls(self, jsonData, fmt, bitrate='best'):
333 """Get urls from 'audio_formats' section in json"""
334 file_url = None
335 try:
336 bitrate_list = jsonData[fmt]
337 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
338 bitrate = max(bitrate_list) # select highest
339
340 url_list = jsonData[fmt][bitrate]
341 except TypeError: # we have no bitrate info.
342 url_list = jsonData[fmt]
343 return url_list
344
345 def check_urls(self, url_list):
346 """Returns 1st active url from list"""
347 for url in url_list:
348 try:
349 compat_urllib_request.urlopen(url)
350 return url
351 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
352 url = None
353
354 return None
355
356 def _print_formats(self, formats):
357 print('Available formats:')
358 for fmt in formats.keys():
359 for b in formats[fmt]:
360 try:
361 ext = formats[fmt][b][0]
362 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
363 except TypeError: # we have no bitrate info
364 ext = formats[fmt][0]
365 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
366 break
367
368 def _real_extract(self, url):
369 mobj = re.match(self._VALID_URL, url)
370 if mobj is None:
371 raise ExtractorError(u'Invalid URL: %s' % url)
372 # extract uploader & filename from url
373 uploader = mobj.group(1).decode('utf-8')
374 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
375
376 # construct API request
377 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
378 # retrieve .json file with links to files
379 request = compat_urllib_request.Request(file_url)
380 try:
381 self.report_download_json(file_url)
382 jsonData = compat_urllib_request.urlopen(request).read()
383 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
384 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
385
386 # parse JSON
387 json_data = json.loads(jsonData)
388 player_url = json_data['player_swf_url']
389 formats = dict(json_data['audio_formats'])
390
391 req_format = self._downloader.params.get('format', None)
392 bitrate = None
393
394 if self._downloader.params.get('listformats', None):
395 self._print_formats(formats)
396 return
397
398 if req_format is None or req_format == 'best':
399 for format_param in formats.keys():
400 url_list = self.get_urls(formats, format_param)
401 # check urls
402 file_url = self.check_urls(url_list)
403 if file_url is not None:
404 break # got it!
405 else:
406 if req_format not in formats:
407 raise ExtractorError(u'Format is not available')
408
409 url_list = self.get_urls(formats, req_format)
410 file_url = self.check_urls(url_list)
411 format_param = req_format
412
413 return [{
414 'id': file_id.decode('utf-8'),
415 'url': file_url.decode('utf-8'),
416 'uploader': uploader.decode('utf-8'),
417 'upload_date': None,
418 'title': json_data['name'],
419 'ext': file_url.split('.')[-1].decode('utf-8'),
420 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
421 'thumbnail': json_data['thumbnail_url'],
422 'description': json_data['description'],
423 'player_url': player_url.decode('utf-8'),
424 }]
425
426 class StanfordOpenClassroomIE(InfoExtractor):
427 """Information extractor for Stanford's Open ClassRoom"""
428
429 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
430 IE_NAME = u'stanfordoc'
431
432 def _real_extract(self, url):
433 mobj = re.match(self._VALID_URL, url)
434 if mobj is None:
435 raise ExtractorError(u'Invalid URL: %s' % url)
436
437 if mobj.group('course') and mobj.group('video'): # A specific video
438 course = mobj.group('course')
439 video = mobj.group('video')
440 info = {
441 'id': course + '_' + video,
442 'uploader': None,
443 'upload_date': None,
444 }
445
446 self.report_extraction(info['id'])
447 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
448 xmlUrl = baseUrl + video + '.xml'
449 try:
450 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
451 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
452 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
453 mdoc = xml.etree.ElementTree.fromstring(metaXml)
454 try:
455 info['title'] = mdoc.findall('./title')[0].text
456 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
457 except IndexError:
458 raise ExtractorError(u'Invalid metadata XML file')
459 info['ext'] = info['url'].rpartition('.')[2]
460 return [info]
461 elif mobj.group('course'): # A course page
462 course = mobj.group('course')
463 info = {
464 'id': course,
465 'type': 'playlist',
466 'uploader': None,
467 'upload_date': None,
468 }
469
470 coursepage = self._download_webpage(url, info['id'],
471 note='Downloading course info page',
472 errnote='Unable to download course info page')
473
474 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
475
476 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
477 coursepage, u'description', fatal=False)
478
479 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
480 info['list'] = [
481 {
482 'type': 'reference',
483 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
484 }
485 for vpage in links]
486 results = []
487 for entry in info['list']:
488 assert entry['type'] == 'reference'
489 results += self.extract(entry['url'])
490 return results
491 else: # Root page
492 info = {
493 'id': 'Stanford OpenClassroom',
494 'type': 'playlist',
495 'uploader': None,
496 'upload_date': None,
497 }
498
499 self.report_download_webpage(info['id'])
500 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
501 try:
502 rootpage = compat_urllib_request.urlopen(rootURL).read()
503 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
504 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
505
506 info['title'] = info['id']
507
508 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
509 info['list'] = [
510 {
511 'type': 'reference',
512 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
513 }
514 for cpage in links]
515
516 results = []
517 for entry in info['list']:
518 assert entry['type'] == 'reference'
519 results += self.extract(entry['url'])
520 return results
521
522 class MTVIE(InfoExtractor):
523 """Information extractor for MTV.com"""
524
525 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
526 IE_NAME = u'mtv'
527
528 def _real_extract(self, url):
529 mobj = re.match(self._VALID_URL, url)
530 if mobj is None:
531 raise ExtractorError(u'Invalid URL: %s' % url)
532 if not mobj.group('proto'):
533 url = 'http://' + url
534 video_id = mobj.group('videoid')
535
536 webpage = self._download_webpage(url, video_id)
537
538 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
539 webpage, u'song name', fatal=False)
540
541 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
542 webpage, u'title')
543
544 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
545 webpage, u'mtvn_uri', fatal=False)
546
547 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
548 webpage, u'content id', fatal=False)
549
550 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
551 self.report_extraction(video_id)
552 request = compat_urllib_request.Request(videogen_url)
553 try:
554 metadataXml = compat_urllib_request.urlopen(request).read()
555 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
556 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
557
558 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
559 renditions = mdoc.findall('.//rendition')
560
561 # For now, always pick the highest quality.
562 rendition = renditions[-1]
563
564 try:
565 _,_,ext = rendition.attrib['type'].partition('/')
566 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
567 video_url = rendition.find('./src').text
568 except KeyError:
569 raise ExtractorError('Invalid rendition field.')
570
571 info = {
572 'id': video_id,
573 'url': video_url,
574 'uploader': performer,
575 'upload_date': None,
576 'title': video_title,
577 'ext': ext,
578 'format': format,
579 }
580
581 return [info]
582
583
584 class YoukuIE(InfoExtractor):
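    # Overview: the getPlayList response carries a scrambled 'streamfileids' value and a
    # per-video 'seed'. _get_file_ID_mix_string derives a shuffled alphabet from the seed,
    # _get_file_id maps the '*'-separated indices back to characters, and _real_extract
    # then requests one getFlvPath URL per segment key listed under 'segs'.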
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000, 1998)
        random2 = random.randint(1000, 9999)

        return "%d%d%d" % (nowTime, random1, random2)

    def _get_file_ID_mix_string(self, seed):
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031) % 65536
            index = math.floor(seed / 65536 * len(source))
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # The characters at indices 8 and 9 of fileid encode the segment number,
        # so they are replaced with the segment index for every part below.
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info


class XNXXIE(InfoExtractor):
679 """Information extractor for xnxx.com"""
680
681 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
682 IE_NAME = u'xnxx'
683 VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
684 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
685 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
686
687 def _real_extract(self, url):
688 mobj = re.match(self._VALID_URL, url)
689 if mobj is None:
690 raise ExtractorError(u'Invalid URL: %s' % url)
691 video_id = mobj.group(1)
692
693 # Get webpage content
694 webpage = self._download_webpage(url, video_id)
695
696 video_url = self._search_regex(self.VIDEO_URL_RE,
697 webpage, u'video URL')
698 video_url = compat_urllib_parse.unquote(video_url)
699
700 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
701 webpage, u'title')
702
703 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
704 webpage, u'thumbnail', fatal=False)
705
706 return [{
707 'id': video_id,
708 'url': video_url,
709 'uploader': None,
710 'upload_date': None,
711 'title': video_title,
712 'ext': 'flv',
713 'thumbnail': video_thumbnail,
714 'description': None,
715 }]
716
717
718
719 class NBAIE(InfoExtractor):
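    # Overview: no API call is needed here; the download URL is built directly from the
    # video path against Turner's CDN, and only the title and description are scraped
    # from the page's meta tags.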
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # The upload date is not present in the HTML the site returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }
        return [info]


class JustinTVIE(InfoExtractor):
754 """Information extractor for justin.tv and twitch.tv"""
755 # TODO: One broadcast may be split into multiple videos. The key
756 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
757 # starts at 1 and increases. Can we treat all parts as one video?
758
759 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
760 (?:
761 (?P<channelid>[^/]+)|
762 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
763 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
764 )
765 /?(?:\#.*)?$
766 """
767 _JUSTIN_PAGE_LIMIT = 100
768 IE_NAME = u'justin.tv'
769
770 def report_download_page(self, channel, offset):
771 """Report attempt to download a single page of videos."""
772 self.to_screen(u'%s: Downloading video information from %d to %d' %
773 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
774
775 # Return count of items, list of *valid* items
776 def _parse_page(self, url, video_id):
777 webpage = self._download_webpage(url, video_id,
778 u'Downloading video info JSON',
779 u'unable to download video info JSON')
780
781 response = json.loads(webpage)
782 if type(response) != list:
783 error_text = response.get('error', 'unknown error')
784 raise ExtractorError(u'Justin.tv API: %s' % error_text)
785 info = []
786 for clip in response:
787 video_url = clip['video_file_url']
788 if video_url:
789 video_extension = os.path.splitext(video_url)[1][1:]
790 video_date = re.sub('-', '', clip['start_time'][:10])
791 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
792 video_id = clip['id']
793 video_title = clip.get('title', video_id)
794 info.append({
795 'id': video_id,
796 'url': video_url,
797 'title': video_title,
798 'uploader': clip.get('channel_name', video_uploader_id),
799 'uploader_id': video_uploader_id,
800 'upload_date': video_date,
801 'ext': video_extension,
802 })
803 return (len(response), info)
804
805 def _real_extract(self, url):
806 mobj = re.match(self._VALID_URL, url)
807 if mobj is None:
808 raise ExtractorError(u'invalid URL: %s' % url)
809
810 api_base = 'http://api.justin.tv'
811 paged = False
812 if mobj.group('channelid'):
813 paged = True
814 video_id = mobj.group('channelid')
815 api = api_base + '/channel/archives/%s.json' % video_id
816 elif mobj.group('chapterid'):
817 chapter_id = mobj.group('chapterid')
818
819 webpage = self._download_webpage(url, chapter_id)
820 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
821 if not m:
822 raise ExtractorError(u'Cannot find archive of a chapter')
823 archive_id = m.group(1)
824
825 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
826 chapter_info_xml = self._download_webpage(api, chapter_id,
827 note=u'Downloading chapter information',
828 errnote=u'Chapter information download failed')
829 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
830 for a in doc.findall('.//archive'):
831 if archive_id == a.find('./id').text:
832 break
833 else:
834 raise ExtractorError(u'Could not find chapter in chapter information')
835
836 video_url = a.find('./video_file_url').text
837 video_ext = video_url.rpartition('.')[2] or u'flv'
838
839 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
840 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
841 note='Downloading chapter metadata',
842 errnote='Download of chapter metadata failed')
843 chapter_info = json.loads(chapter_info_json)
844
845 bracket_start = int(doc.find('.//bracket_start').text)
846 bracket_end = int(doc.find('.//bracket_end').text)
847
848 # TODO determine start (and probably fix up file)
849 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
850 #video_url += u'?start=' + TODO:start_timestamp
851 # bracket_start is 13290, but we want 51670615
852 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
853 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
854
855 info = {
856 'id': u'c' + chapter_id,
857 'url': video_url,
858 'ext': video_ext,
859 'title': chapter_info['title'],
860 'thumbnail': chapter_info['preview'],
861 'description': chapter_info['description'],
862 'uploader': chapter_info['channel']['display_name'],
863 'uploader_id': chapter_info['channel']['name'],
864 }
865 return [info]
866 else:
867 video_id = mobj.group('videoid')
868 api = api_base + '/broadcast/by_archive/%s.json' % video_id
869
870 self.report_extraction(video_id)
871
872 info = []
873 offset = 0
874 limit = self._JUSTIN_PAGE_LIMIT
875 while True:
876 if paged:
877 self.report_download_page(video_id, offset)
878 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
879 page_count, page_info = self._parse_page(page_url, video_id)
880 info.extend(page_info)
881 if not paged or page_count != limit:
882 break
883 offset += limit
884 return info
885
886 class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }
        return [info]


class SteamIE(InfoExtractor):
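    # Overview: the store page embeds each trailer as a 'movie_<id>' flashvars block.
    # The regexes below walk those blocks together with the matching <span class="title">
    # and movie_thumb tags, and the results are returned as a single playlist per game.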
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
            webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]


class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
        return info


class WorldStarHipHopIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        if 'mp4' in video_url:
            ext = 'mp4'
        else:
            ext = 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Get the thumbnail; if there is none, this is a WSHH candy video,
        # so pull the correct title from the candy markup instead.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

        results = [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
        return results


class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }
        return [info]


class YouPornIE(InfoExtractor):
1078 """Information extractor for youporn.com."""
1079 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
1080
1081 def _print_formats(self, formats):
1082 """Print all available formats"""
1083 print(u'Available formats:')
1084 print(u'ext\t\tformat')
1085 print(u'---------------------------------')
1086 for format in formats:
1087 print(u'%s\t\t%s' % (format['ext'], format['format']))
1088
1089 def _specific(self, req_format, formats):
1090 for x in formats:
1091 if(x["format"]==req_format):
1092 return x
1093 return None
1094
1095 def _real_extract(self, url):
1096 mobj = re.match(self._VALID_URL, url)
1097 if mobj is None:
1098 raise ExtractorError(u'Invalid URL: %s' % url)
1099 video_id = mobj.group('videoid')
1100
1101 req = compat_urllib_request.Request(url)
1102 req.add_header('Cookie', 'age_verified=1')
1103 webpage = self._download_webpage(req, video_id)
1104
1105 # Get JSON parameters
1106 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
1107 try:
1108 params = json.loads(json_params)
1109 except:
1110 raise ExtractorError(u'Invalid JSON')
1111
1112 self.report_extraction(video_id)
1113 try:
1114 video_title = params['title']
1115 upload_date = unified_strdate(params['release_date_f'])
1116 video_description = params['description']
1117 video_uploader = params['submitted_by']
1118 thumbnail = params['thumbnails'][0]['image']
1119 except KeyError:
1120 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
1121
1122 # Get all of the formats available
1123 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
1124 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
1125 webpage, u'download list').strip()
1126
1127 # Get all of the links from the page
1128 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
1129 links = re.findall(LINK_RE, download_list_html)
1130 if(len(links) == 0):
1131 raise ExtractorError(u'ERROR: no known formats available for video')
1132
1133 self.to_screen(u'Links found: %d' % len(links))
1134
1135 formats = []
1136 for link in links:
1137
1138 # A link looks like this:
1139 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
1140 # A path looks like this:
1141 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
1142 video_url = unescapeHTML( link )
1143 path = compat_urllib_parse_urlparse( video_url ).path
1144 extension = os.path.splitext( path )[1][1:]
1145 format = path.split('/')[4].split('_')[:2]
1146 size = format[0]
1147 bitrate = format[1]
1148 format = "-".join( format )
1149 # title = u'%s-%s-%s' % (video_title, size, bitrate)
1150
1151 formats.append({
1152 'id': video_id,
1153 'url': video_url,
1154 'uploader': video_uploader,
1155 'upload_date': upload_date,
1156 'title': video_title,
1157 'ext': extension,
1158 'format': format,
1159 'thumbnail': thumbnail,
1160 'description': video_description
1161 })
1162
1163 if self._downloader.params.get('listformats', None):
1164 self._print_formats(formats)
1165 return
1166
1167 req_format = self._downloader.params.get('format', None)
1168 self.to_screen(u'Format: %s' % req_format)
1169
1170 if req_format is None or req_format == 'best':
1171 return [formats[0]]
1172 elif req_format == 'worst':
1173 return [formats[-1]]
1174 elif req_format in ('-1', 'all'):
1175 return formats
1176 else:
1177 format = self._specific( req_format, formats )
1178 if result is None:
1179 raise ExtractorError(u'Requested format not available')
1180 return [format]
1181
1182
1183
1184 class PornotubeIE(InfoExtractor):
1185 """Information extractor for pornotube.com."""
1186 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
1187
1188 def _real_extract(self, url):
1189 mobj = re.match(self._VALID_URL, url)
1190 if mobj is None:
1191 raise ExtractorError(u'Invalid URL: %s' % url)
1192
1193 video_id = mobj.group('videoid')
1194 video_title = mobj.group('title')
1195
1196 # Get webpage content
1197 webpage = self._download_webpage(url, video_id)
1198
1199 # Get the video URL
1200 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
1201 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
1202 video_url = compat_urllib_parse.unquote(video_url)
1203
1204 #Get the uploaded date
1205 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
1206 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
1207 if upload_date: upload_date = unified_strdate(upload_date)
1208
1209 info = {'id': video_id,
1210 'url': video_url,
1211 'uploader': None,
1212 'upload_date': upload_date,
1213 'title': video_title,
1214 'ext': 'flv',
1215 'format': 'flv'}
1216
1217 return [info]
1218
1219 class YouJizzIE(InfoExtractor):
1220 """Information extractor for youjizz.com."""
1221 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
1222
1223 def _real_extract(self, url):
1224 mobj = re.match(self._VALID_URL, url)
1225 if mobj is None:
1226 raise ExtractorError(u'Invalid URL: %s' % url)
1227
1228 video_id = mobj.group('videoid')
1229
1230 # Get webpage content
1231 webpage = self._download_webpage(url, video_id)
1232
1233 # Get the video title
1234 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
1235 webpage, u'title').strip()
1236
1237 # Get the embed page
1238 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
1239 if result is None:
1240 raise ExtractorError(u'ERROR: unable to extract embed page')
1241
1242 embed_page_url = result.group(0).strip()
1243 video_id = result.group('videoid')
1244
1245 webpage = self._download_webpage(embed_page_url, video_id)
1246
1247 # Get the video URL
1248 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
1249 webpage, u'video URL')
1250
1251 info = {'id': video_id,
1252 'url': video_url,
1253 'title': video_title,
1254 'ext': 'flv',
1255 'format': 'flv',
1256 'player_url': embed_page_url}
1257
1258 return [info]
1259
1260 class EightTracksIE(InfoExtractor):
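    # Overview: the mix metadata is read from the PAGE.mix object embedded in the page,
    # then the 8tracks play/next JSON API is polled once per track (with a random session
    # id) until 'at_last_track' is set, yielding one m4a entry per song.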
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res


class KeekIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }
        return [info]


class TEDIE(InfoExtractor):
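    # Overview: both single talks and playlists are handled. Playlist pages are scraped
    # for their talk URLs and re-dispatched through url_result/playlist_result, while
    # _talk_info reads the embedded talkDetails JSON and picks the last htmlStreams entry.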
    _VALID_URL = r'''http://www\.ted\.com/
        (
            ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>\w+) # Here goes the name and then ".html"
        '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else:
            playlist_id = m.group('playlist_id')
            name = m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id, name))
            return [self._playlist_videos_info(url, name, playlist_id)]

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE = r'''
            <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
            ([.\s]*?)data-playlist_item_id="(\d+)"
            ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
            '''
        video_name_RE = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
            webpage, 'playlist title')

        playlist_entries = []
        for m_video, m_name in zip(m_videos, m_names):
            video_id = m_video.group('video_id')
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
            webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
            webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
            webpage, 'description', flags=re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
            webpage, 'thumbnail')
        info = {
            'id': info['id'],
            'url': info['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': desc,
        }
        return info


class MySpassIE(InfoExtractor):
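    # Overview: the video id is taken from the last (or second to last) path element of
    # the URL, and all metadata comes from getvideometadataxml.php, which returns an XML
    # document with url_flv, title, format_id, description and imagePreview elements.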
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]


class SpiegelIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]


class LiveLeakIE(InfoExtractor):
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]


class TumblrIE(InfoExtractor):
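    # Overview: the post page embeds the player with \x22-escaped attributes, so the
    # regexes below match the escaped markup to pull out the video_file URL, the
    # container extension and the first poster thumbnail.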
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]


class BandcampIE(InfoExtractor):
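    # Overview: only freely downloadable tracks are supported. The track id is read from
    # the TralbumData block, the free download page yields the mp3-320 entry, and a
    # statdownload request (with a fixed .rand value) returns the final retry_url.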
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
            webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
            'Downloading free downloads page')
        # We get the dictionary of the track from some JavaScript code
        info = re.search(r'items: (.*?),$',
            download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built by Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]


class RedTubeIE(InfoExtractor):
1610 """Information Extractor for redtube"""
1611 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
1612
1613 def _real_extract(self,url):
1614 mobj = re.match(self._VALID_URL, url)
1615 if mobj is None:
1616 raise ExtractorError(u'Invalid URL: %s' % url)
1617
1618 video_id = mobj.group('id')
1619 video_extension = 'mp4'
1620 webpage = self._download_webpage(url, video_id)
1621
1622 self.report_extraction(video_id)
1623
1624 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
1625 webpage, u'video URL')
1626
1627 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
1628 webpage, u'title')
1629
1630 return [{
1631 'id': video_id,
1632 'url': video_url,
1633 'ext': video_extension,
1634 'title': video_title,
1635 }]
1636
1637 class InaIE(InfoExtractor):
1638 """Information Extractor for Ina.fr"""
1639 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
1640
1641 def _real_extract(self, url):
1642 mobj = re.match(self._VALID_URL, url)
1643
1644 video_id = mobj.group('id')
1645 mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
1646 video_extension = 'mp4'
1647 webpage = self._download_webpage(mrss_url, video_id)
1648
1649 self.report_extraction(video_id)
1650
1651 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
1652 webpage, u'video URL')
1653
1654 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
1655 webpage, u'title')
1656
1657 return [{
1658 'id': video_id,
1659 'url': video_url,
1660 'ext': video_extension,
1661 'title': video_title,
1662 }]
1663
1664 class HowcastIE(InfoExtractor):
1665 """Information Extractor for Howcast.com"""
1666 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
1667
1668 def _real_extract(self, url):
1669 mobj = re.match(self._VALID_URL, url)
1670
1671 video_id = mobj.group('id')
1672 webpage_url = 'http://www.howcast.com/videos/' + video_id
1673 webpage = self._download_webpage(webpage_url, video_id)
1674
1675 self.report_extraction(video_id)
1676
1677 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
1678 webpage, u'video URL')
1679
1680 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
1681 webpage, u'title')
1682
1683 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
1684 webpage, u'description', fatal=False)
1685
1686 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
1687 webpage, u'thumbnail', fatal=False)
1688
1689 return [{
1690 'id': video_id,
1691 'url': video_url,
1692 'ext': 'mp4',
1693 'title': video_title,
1694 'description': video_description,
1695 'thumbnail': thumbnail,
1696 }]
1697
1698 class VineIE(InfoExtractor):
1699 """Information Extractor for Vine.co"""
1700 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
1701
1702 def _real_extract(self, url):
1703 mobj = re.match(self._VALID_URL, url)
1704
1705 video_id = mobj.group('id')
1706 webpage_url = 'https://vine.co/v/' + video_id
1707 webpage = self._download_webpage(webpage_url, video_id)
1708
1709 self.report_extraction(video_id)
1710
1711 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
1712 webpage, u'video URL')
1713
1714 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1715 webpage, u'title')
1716
1717 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
1718 webpage, u'thumbnail', fatal=False)
1719
1720 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
1721 webpage, u'uploader', fatal=False, flags=re.DOTALL)
1722
1723 return [{
1724 'id': video_id,
1725 'url': video_url,
1726 'ext': 'mp4',
1727 'title': video_title,
1728 'thumbnail': thumbnail,
1729 'uploader': uploader,
1730 }]
1731
1732 class FlickrIE(InfoExtractor):
1733 """Information Extractor for Flickr videos"""
1734 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
1735
1736 def _real_extract(self, url):
1737 mobj = re.match(self._VALID_URL, url)
1738
1739 video_id = mobj.group('id')
1740 video_uploader_id = mobj.group('uploader_id')
1741 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
1742 webpage = self._download_webpage(webpage_url, video_id)
1743
1744 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
1745
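# The stream location is resolved in two steps: video_mtl_xml.gne returns a node id,
# which video_playlist.gne then resolves to a playlist whose STREAM entry (APP +
# FULLPATH) is concatenated into the final video URL below.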
1746 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
1747 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
1748
1749 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
1750 first_xml, u'node_id')
1751
1752 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
1753 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
1754
1755 self.report_extraction(video_id)
1756
1757 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
1758 if mobj is None:
1759 raise ExtractorError(u'Unable to extract video url')
1760 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
1761
1762 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
1763 webpage, u'video title')
1764
1765 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
1766 webpage, u'description', fatal=False)
1767
1768 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
1769 webpage, u'thumbnail', fatal=False)
1770
1771 return [{
1772 'id': video_id,
1773 'url': video_url,
1774 'ext': 'mp4',
1775 'title': video_title,
1776 'description': video_description,
1777 'thumbnail': thumbnail,
1778 'uploader_id': video_uploader_id,
1779 }]
1780
1781 class TeamcocoIE(InfoExtractor):
1782 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
1783
1784 def _real_extract(self, url):
1785 mobj = re.match(self._VALID_URL, url)
1786 if mobj is None:
1787 raise ExtractorError(u'Invalid URL: %s' % url)
1788 url_title = mobj.group('url_title')
1789 webpage = self._download_webpage(url, url_title)
1790
1791 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
1792 webpage, u'video id')
1793
1794 self.report_extraction(video_id)
1795
1796 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1797 webpage, u'title')
1798
1799 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
1800 webpage, u'thumbnail', fatal=False)
1801
1802 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
1803 webpage, u'description', fatal=False)
1804
1805 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
1806 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
1807
1808 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
1809 data, u'video URL')
1810
1811 return [{
1812 'id': video_id,
1813 'url': video_url,
1814 'ext': 'mp4',
1815 'title': video_title,
1816 'thumbnail': thumbnail,
1817 'description': video_description,
1818 }]
1819
1820 class XHamsterIE(InfoExtractor):
1821 """Information Extractor for xHamster"""
1822 _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
1823
1824 def _real_extract(self, url):
1825 mobj = re.match(self._VALID_URL, url)
1826
1827 video_id = mobj.group('id')
1828 webpage_url = 'http://xhamster.com/movies/%s/.html' % video_id
1829 webpage = self._download_webpage(webpage_url, video_id)
1830
1831 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
1832 if mobj is None:
1833 raise ExtractorError(u'Unable to extract media URL')
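# When 'srv' is empty, 'file' already holds the full (percent-encoded) media URL;
# otherwise the final URL is the server followed by '/key=' and the file token.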
1834 if len(mobj.group('server')) == 0:
1835 video_url = compat_urllib_parse.unquote(mobj.group('file'))
1836 else:
1837 video_url = mobj.group('server')+'/key='+mobj.group('file')
1838 video_extension = video_url.split('.')[-1]
1839
1840 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
1841 webpage, u'title')
1842
1843 # Can't see the description anywhere in the UI
1844 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
1845 # webpage, u'description', fatal=False)
1846 # if video_description: video_description = unescapeHTML(video_description)
1847
1848 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
1849 if mobj:
1850 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
1851 else:
1852 video_upload_date = None
1853 self._downloader.report_warning(u'Unable to extract upload date')
1854
1855 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
1856 webpage, u'uploader id', default=u'anonymous')
1857
1858 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
1859 webpage, u'thumbnail', fatal=False)
1860
1861 return [{
1862 'id': video_id,
1863 'url': video_url,
1864 'ext': video_extension,
1865 'title': video_title,
1866 # 'description': video_description,
1867 'upload_date': video_upload_date,
1868 'uploader_id': video_uploader_id,
1869 'thumbnail': video_thumbnail
1870 }]
1871
1872 class HypemIE(InfoExtractor):
1873 """Information Extractor for hypem"""
1874 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
1875
1876 def _real_extract(self, url):
1877 mobj = re.match(self._VALID_URL, url)
1878 if mobj is None:
1879 raise ExtractorError(u'Invalid URL: %s' % url)
1880 track_id = mobj.group(1)
1881
1882 data = { 'ax': 1, 'ts': time.time() }
1883 data_encoded = compat_urllib_parse.urlencode(data)
1884 complete_url = url + "?" + data_encoded
1885 request = compat_urllib_request.Request(complete_url)
1886 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
1887 cookie = urlh.headers.get('Set-Cookie', '')
1888
1889 self.report_extraction(track_id)
1890
1891 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
1892 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
1893 try:
1894 track_list = json.loads(html_tracks)
1895 track = track_list[u'tracks'][0]
1896 except ValueError:
1897 raise ExtractorError(u'Hypemachine contained invalid JSON.')
1898
1899 key = track[u"key"]
1900 track_id = track[u"id"]
1901 artist = track[u"artist"]
1902 title = track[u"song"]
1903
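# The serve/source endpoint is requested with the cookie captured from the initial
# page, so the call is tied to the same session; it returns JSON whose 'url' field
# holds the final stream URL.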
1904 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
1905 request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
1906 request.add_header('cookie', cookie)
1907 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
1908 try:
1909 song_data = json.loads(song_data_json)
1910 except ValueError:
1911 raise ExtractorError(u'Hypemachine contained invalid JSON.')
1912 final_url = song_data[u"url"]
1913
1914 return [{
1915 'id': track_id,
1916 'url': final_url,
1917 'ext': "mp3",
1918 'title': title,
1919 'artist': artist,
1920 }]
1921
1922 class Vbox7IE(InfoExtractor):
1923 """Information Extractor for Vbox7"""
1924 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
1925
1926 def _real_extract(self, url):
1927 mobj = re.match(self._VALID_URL, url)
1928 if mobj is None:
1929 raise ExtractorError(u'Invalid URL: %s' % url)
1930 video_id = mobj.group(1)
1931
1932 redirect_page, urlh = self._download_webpage_handle(url, video_id)
1933 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
1934 redirect_url = urlh.geturl() + new_location
1935 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
1936
1937 title = self._html_search_regex(r'<title>(.*)</title>',
1938 webpage, u'title').split('/')[0].strip()
1939
1940 ext = "flv"
1941 info_url = "http://vbox7.com/play/magare.do"
1942 data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
1943 info_request = compat_urllib_request.Request(info_url, data)
1944 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1945 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
1946 if info_response is None:
1947 raise ExtractorError(u'Unable to extract the media url')
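# info_response is expected to consist of exactly two '&'-separated key=value pairs
# (the media URL and the thumbnail URL); the tuple unpacking below relies on that shape.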
1948 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
1949
1950 return [{
1951 'id': video_id,
1952 'url': final_url,
1953 'ext': ext,
1954 'title': title,
1955 'thumbnail': thumbnail_url,
1956 }]
1957
1958
1959 def gen_extractors():
1960 """ Return a list of an instance of every supported extractor.
1961 The order does matter; the first extractor matched is the one handling the URL.
1962 """
1963 return [
1964 YoutubePlaylistIE(),
1965 YoutubeChannelIE(),
1966 YoutubeUserIE(),
1967 YoutubeSearchIE(),
1968 YoutubeIE(),
1969 MetacafeIE(),
1970 DailymotionIE(),
1971 GoogleSearchIE(),
1972 PhotobucketIE(),
1973 YahooIE(),
1974 YahooSearchIE(),
1975 DepositFilesIE(),
1976 FacebookIE(),
1977 BlipTVIE(),
1978 BlipTVUserIE(),
1979 VimeoIE(),
1980 MyVideoIE(),
1981 ComedyCentralIE(),
1982 EscapistIE(),
1983 CollegeHumorIE(),
1984 XVideosIE(),
1985 SoundcloudSetIE(),
1986 SoundcloudIE(),
1987 InfoQIE(),
1988 MixcloudIE(),
1989 StanfordOpenClassroomIE(),
1990 MTVIE(),
1991 YoukuIE(),
1992 XNXXIE(),
1993 YouJizzIE(),
1994 PornotubeIE(),
1995 YouPornIE(),
1996 GooglePlusIE(),
1997 ArteTvIE(),
1998 NBAIE(),
1999 WorldStarHipHopIE(),
2000 JustinTVIE(),
2001 FunnyOrDieIE(),
2002 SteamIE(),
2003 UstreamIE(),
2004 RBMARadioIE(),
2005 EightTracksIE(),
2006 KeekIE(),
2007 TEDIE(),
2008 MySpassIE(),
2009 SpiegelIE(),
2010 LiveLeakIE(),
2011 ARDIE(),
2012 ZDFIE(),
2013 TumblrIE(),
2014 BandcampIE(),
2015 RedTubeIE(),
2016 InaIE(),
2017 HowcastIE(),
2018 VineIE(),
2019 FlickrIE(),
2020 TeamcocoIE(),
2021 XHamsterIE(),
2022 HypemIE(),
2023 Vbox7IE(),
2024 GametrailersIE(),
2025 StatigramIE(),
2026 GenericIE()
2027 ]
2028
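# Illustrative sketch (not part of the original module): because the list returned by
# gen_extractors() is ordered, a caller can resolve a URL by taking the first instance
# whose suitable() classmethod (inherited from extractor.common.InfoExtractor) accepts
# it, roughly like this:
#
#   def _first_suitable_extractor(url):
#       for ie in gen_extractors():
#           if ie.suitable(url):
#               return ie
#       return None
#
# GenericIE is listed last so that it only acts as the fallback when no specific
# extractor matches.
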
2029 def get_info_extractor(ie_name):
2030 """Returns the info extractor class with the given ie_name"""
2031 return globals()[ie_name + 'IE']
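# Illustrative example (not part of the original module): get_info_extractor('Bandcamp')
# looks up 'BandcampIE' in this module's globals and returns the class itself, not an
# instance, so callers are expected to instantiate it themselves.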