]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
Move FunnyOrDie into its own file
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.funnyordie import FunnyOrDieIE
31 from .extractor.gametrailers import GametrailersIE
32 from .extractor.generic import GenericIE
33 from .extractor.googleplus import GooglePlusIE
34 from .extractor.googlesearch import GoogleSearchIE
35 from .extractor.infoq import InfoQIE
36 from .extractor.metacafe import MetacafeIE
37 from .extractor.mixcloud import MixcloudIE
38 from .extractor.mtv import MTVIE
39 from .extractor.myvideo import MyVideoIE
40 from .extractor.nba import NBAIE
41 from .extractor.statigram import StatigramIE
42 from .extractor.photobucket import PhotobucketIE
43 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
44 from .extractor.stanfordoc import StanfordOpenClassroomIE
45 from .extractor.steam import SteamIE
46 from .extractor.ted import TEDIE
47 from .extractor.vimeo import VimeoIE
48 from .extractor.worldstarhiphop import WorldStarHipHopIE
49 from .extractor.xnxx import XNXXIE
50 from .extractor.xvideos import XVideosIE
51 from .extractor.yahoo import YahooIE, YahooSearchIE
52 from .extractor.youku import YoukuIE
53 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
54 from .extractor.zdf import ZDFIE
55
56
57
58
59
60
61
62
63
64
65
66
67
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes are recognized: a bare channel page, a /b/ broadcast
    # (archive) page, and a /c/ chapter page.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Page size used when walking a channel's archive listing.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Fetch one archive API page and build an info dict per clip.

        Returns ``(total_items_on_page, [info dicts])``.  The raw count is
        what the caller uses to decide whether another page exists, while
        the info list only contains clips that expose a video_file_url.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        # On error the API answers with a dict carrying an 'error' key
        # instead of the usual list of clips.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Clips without a file URL are silently skipped.
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with YYYY-MM-DD; dropping the dashes
                # yields the YYYYMMDD upload_date convention.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through the archive listing below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # Chapter page: resolve the chapter to its parent archive and
            # return early with a single-entry result.
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: the else branch fires only when no archive matched;
            # after the break, `a` is the matching <archive> element.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Title/description/uploader come from the newer twitch.tv
            # "kraken" API rather than the justin.tv one.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                       note='Downloading chapter metadata',
                                       errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single broadcast (/b/ URL): one API page is enough.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means the archive listing is exhausted.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
200
201
202
class UstreamIE(InfoExtractor):
    """Extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('videoID')

        # The CDN URL can be derived from the numeric id alone; the page is
        # only needed for the descriptive metadata.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'data-title="(?P<title>.+)"', webpage, u'title')
        uploader = self._html_search_regex(
            r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)
        thumbnail = self._html_search_regex(
            r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
234
235
class RBMARadioIE(InfoExtractor):
    """Extractor for rbmaradio.com show pages."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata lives in a JSON blob assigned to window.gon.show.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
                                       webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Ask the CDN for the 256 kbps stream.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
269
270
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry of formats whose 'format' field equals
        req_format, or None when no such entry exists."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # An age-gate cookie is required to see the real page content.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # was a bare `except:` — only a JSON decode error is expected here
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError:
            # Format the exception instead of concatenating it to a str:
            # `'...' + KeyError(...)` raises TypeError and masks the cause.
            raise ExtractorError(u'Missing JSON parameter: %s' % sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if len(links) == 0:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The 4th path component carries "<size>_<bitrate>_<id>"; the
            # first two pieces joined by '-' become the format label.
            format = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # BUG FIX: previously tested the undefined name `result`,
            # which raised NameError instead of the intended error.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
375
376
377
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flv URL sits, percent-encoded, in the player setup JS.
        video_url = compat_urllib_parse.unquote(self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url'))

        # Upload date is optional; normalize it to YYYYMMDD when present.
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
412
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The public page carries the title and the link to the embed
        # player; the media URL only appears on the embed page.
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<title>(?P<title>.*)</title>', webpage, u'title').strip()

        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
453
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes.

    Returns one info dict per track in the mix.
    """
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded in the page as a JS assignment.
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API requires a session token (any random number works
        # here); tracks are then fetched one at a time: the first via
        # /play, each subsequent one via /next with the previous track id.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            # The API flags the final track; stop instead of requesting
            # past the end of the mix.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
494
class KeekIE(InfoExtractor):
    """Extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('videoID')

        # Media and thumbnail URLs follow a fixed CDN scheme derived from
        # the id; the page is only needed for title/uploader.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
522
523
class MySpassIE(InfoExtractor):
    """Extractor for myspass.de (metadata comes from the site's XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch referenced the undefined name `ext`
            # (NameError); the file extension computed above was intended.
            format = extension
        else:
            format = format_id_el.text
        # Optional fields: fall back to None when the element is absent.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
577
class SpiegelIE(InfoExtractor):
    """Extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('videoID')

        webpage = self._download_webpage(url, video_id)
        title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', webpage, u'title')

        # Each video has a companion XML file listing the available
        # encodings; the last entry in it is the one used here.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': title,
            'duration': duration,
        }]
609
class LiveLeakIE(InfoExtractor):
    """Extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(
            r'file: "(.*?)",', webpage, u'video URL')

        # og:title carries a site prefix which is stripped off here.
        title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()
        description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)
        uploader = self._html_search_regex(
            r'By:.*?(\w+)</a>', webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'uploader': uploader
        }]
646
647
648
class TumblrIE(InfoExtractor):
    """Extractor for videos posted on tumblr.com blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is JS-escaped inside the page (\x22 quotes).
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(
            r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(
            r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': ext,
        }]
682
class BandcampIE(InfoExtractor):
    """Extractor for free bandcamp.com tracks.

    Only tracks exposing a free download page can be extracted; the
    mp3-320 encoding is always selected.
    """
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # The numeric track id lives in the TralbumData JS object.
        # (Local renamed from `id` to avoid shadowing the builtin.)
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
728
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The mp4 source and the title both sit in plain page markup.
        video_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL')
        video_title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
756
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        # Metadata is read from the player's MRSS feed, not the HTML page.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
783
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        # Rebuild the canonical URL from the numeric id.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
817
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        # Rebuild the canonical https URL from the id.
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
851
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        # The stream URL is resolved in three steps:
        #   1. the photo page yields a per-video "secret",
        #   2. an XML endpoint maps (id, secret) to a node_id,
        #   3. a playlist endpoint maps node_id to an APP/FULLPATH pair
        #      whose concatenation is the final stream URL.
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        # FULLPATH is HTML-escaped in the XML; APP is not.
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        # Descriptive metadata comes from the og: tags of the photo page.
        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
900
class TeamcocoIE(InfoExtractor):
    """Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id is embedded in the article markup of the page.
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # The media URL comes from a separate per-video XML document.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
939
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # NOTE: the dot after 'www' is escaped; a bare '.' here would match any
    # single character (e.g. 'wwwx'), not just a literal dot.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Guard against a non-matching URL: without this check a bad URL
        # crashed with AttributeError on mobj.group(), unlike the other IEs
        # in this file which raise ExtractorError.
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Re-fetch through the canonical movie page URL for this id.
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config embeds the server and file name as JS literals.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server given: 'file' is a complete (percent-encoded) URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # Upload date is only available as a tooltip hint; missing date is
        # tolerated with a warning rather than a hard failure.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
991
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # The first request must carry 'ax'/'ts' query parameters; the
        # cookie returned with it is required by the /serve request below.
        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # Track metadata is embedded as a JSON blob in a <script> tag.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except (ValueError, KeyError, IndexError):
            # Broadened from bare ValueError: valid JSON missing the
            # expected 'tracks' list previously escaped as an unhandled
            # KeyError/IndexError.
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
            # Look up the media URL inside the try so a missing 'url' key is
            # reported as an extraction error rather than a raw KeyError.
            final_url = song_data[u"url"]
        except (ValueError, KeyError):
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
1041
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        # The play page only issues a JS redirect; follow it manually.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(
            r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        page = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # The <title> also carries extra path-like parts; keep the first one.
        raw_title = self._html_search_regex(r'<title>(.*)</title>', page, u'title')
        title = raw_title.split('/')[0].strip()

        # POST to the info endpoint to obtain the media and thumbnail URLs.
        post_data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request("http://vbox7.com/play/magare.do", post_data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response looks like 'key1=<media url>&key2=<thumbnail url>'.
        final_url, thumbnail_url = [field.split('=')[1] for field in info_response.split('&')]

        return [{
            'id': video_id,
            'url': final_url,
            'ext': "flv",
            'title': title,
            'thumbnail': thumbnail_url,
        }]
1077
1078
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Ordered by matching priority; GenericIE must stay last as the fallback.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [klass() for klass in ie_classes]
1148
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes all follow the '<Name>IE' naming convention, so the
    # class can be looked up directly in this module's namespace.
    return globals()['%sIE' % ie_name]