youtube_dl/InfoExtractors.py

   1 import base64
   2 import datetime
   3 import itertools
   4 import netrc
   5 import os
   6 import re
   7 import socket
   8 import time
   9 import email.utils
  10 import xml.etree.ElementTree
  11 import random
  12 import math
  13 import operator
  14 import hashlib
  15 import binascii
  16 import urllib
  17
  18 from .utils import *
  19 from .extractor.common import InfoExtractor, SearchInfoExtractor
  20
  21 from .extractor.ard import ARDIE
  22 from .extractor.arte import ArteTvIE
  23 from .extractor.bandcamp import BandcampIE
  24 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
  25 from .extractor.comedycentral import ComedyCentralIE
  26 from .extractor.collegehumor import CollegeHumorIE
  27 from .extractor.dailymotion import DailymotionIE
  28 from .extractor.depositfiles import DepositFilesIE
  29 from .extractor.eighttracks import EightTracksIE
  30 from .extractor.escapist import EscapistIE
  31 from .extractor.facebook import FacebookIE
  32 from .extractor.funnyordie import FunnyOrDieIE
  33 from .extractor.gametrailers import GametrailersIE
  34 from .extractor.generic import GenericIE
  35 from .extractor.googleplus import GooglePlusIE
  36 from .extractor.googlesearch import GoogleSearchIE
  37 from .extractor.infoq import InfoQIE
  38 from .extractor.justintv import JustinTVIE
  39 from .extractor.keek import KeekIE
  40 from .extractor.liveleak import LiveLeakIE
  41 from .extractor.metacafe import MetacafeIE
  42 from .extractor.mixcloud import MixcloudIE
  43 from .extractor.mtv import MTVIE
  44 from .extractor.myspass import MySpassIE
  45 from .extractor.myvideo import MyVideoIE
  46 from .extractor.nba import NBAIE
  47 from .extractor.statigram import StatigramIE
  48 from .extractor.photobucket import PhotobucketIE
  49 from .extractor.pornotube import PornotubeIE
  50 from .extractor.rbmaradio import RBMARadioIE
  51 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
  52 from .extractor.spiegel import SpiegelIE
  53 from .extractor.stanfordoc import StanfordOpenClassroomIE
  54 from .extractor.steam import SteamIE
  55 from .extractor.ted import TEDIE
  56 from .extractor.tumblr import TumblrIE
  57 from .extractor.ustream import UstreamIE
  58 from .extractor.vbox7 import Vbox7IE
  59 from .extractor.vimeo import VimeoIE
  60 from .extractor.vine import VineIE
  61 from .extractor.worldstarhiphop import WorldStarHipHopIE
  62 from .extractor.xnxx import XNXXIE
  63 from .extractor.xvideos import XVideosIE
  64 from .extractor.yahoo import YahooIE, YahooSearchIE
  65 from .extractor.youjizz import YouJizzIE
  66 from .extractor.youku import YoukuIE
  67 from .extractor.youporn import YouPornIE
  68 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
  69 from .extractor.zdf import ZDFIE
  70
  71
  72
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85
  86
  87
  88
  89
  90
  91
  92
  93
  94
  95
  96
  97
  98
  99
 100
 101
 102
 103
 104
 105 class RedTubeIE(InfoExtractor):
 106     """Information Extractor for redtube"""
 107     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
 108
 109     def _real_extract(self,url):
 110         mobj = re.match(self._VALID_URL, url)
 111         if mobj is None:
 112             raise ExtractorError(u'Invalid URL: %s' % url)
 113
 114         video_id = mobj.group('id')
 115         video_extension = 'mp4'
 116         webpage = self._download_webpage(url, video_id)
 117
 118         self.report_extraction(video_id)
 119
 120         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
 121             webpage, u'video URL')
 122
 123         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
 124             webpage, u'title')
 125
 126         return [{
 127             'id':       video_id,
 128             'url':      video_url,
 129             'ext':      video_extension,
 130             'title':    video_title,
 131         }]
 132
 133 class InaIE(InfoExtractor):
 134     """Information Extractor for Ina.fr"""
 135     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
 136
 137     def _real_extract(self,url):
 138         mobj = re.match(self._VALID_URL, url)
 139
 140         video_id = mobj.group('id')
 141         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
 142         video_extension = 'mp4'
 143         webpage = self._download_webpage(mrss_url, video_id)
 144
 145         self.report_extraction(video_id)
 146
 147         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
 148             webpage, u'video URL')
 149
 150         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
 151             webpage, u'title')
 152
 153         return [{
 154             'id':       video_id,
 155             'url':      video_url,
 156             'ext':      video_extension,
 157             'title':    video_title,
 158         }]
 159
 160 class HowcastIE(InfoExtractor):
 161     """Information Extractor for Howcast.com"""
 162     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
 163
 164     def _real_extract(self, url):
 165         mobj = re.match(self._VALID_URL, url)
 166
 167         video_id = mobj.group('id')
 168         webpage_url = 'http://www.howcast.com/videos/' + video_id
 169         webpage = self._download_webpage(webpage_url, video_id)
 170
 171         self.report_extraction(video_id)
 172
 173         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
 174             webpage, u'video URL')
 175
 176         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
 177             webpage, u'title')
 178
 179         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
 180             webpage, u'description', fatal=False)
 181
 182         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
 183             webpage, u'thumbnail', fatal=False)
 184
 185         return [{
 186             'id':       video_id,
 187             'url':      video_url,
 188             'ext':      'mp4',
 189             'title':    video_title,
 190             'description': video_description,
 191             'thumbnail': thumbnail,
 192         }]
 193
 194
 195 class FlickrIE(InfoExtractor):
 196     """Information Extractor for Flickr videos"""
 197     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
 198
 199     def _real_extract(self, url):
 200         mobj = re.match(self._VALID_URL, url)
 201
 202         video_id = mobj.group('id')
 203         video_uploader_id = mobj.group('uploader_id')
 204         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
 205         webpage = self._download_webpage(webpage_url, video_id)
 206
 207         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
 208
 209         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
 210         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
 211
 212         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
 213             first_xml, u'node_id')
 214
 215         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
 216         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
 217
 218         self.report_extraction(video_id)
 219
 220         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
 221         if mobj is None:
 222             raise ExtractorError(u'Unable to extract video url')
 223         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
 224
 225         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
 226             webpage, u'video title')
 227
 228         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
 229             webpage, u'description', fatal=False)
 230
 231         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
 232             webpage, u'thumbnail', fatal=False)
 233
 234         return [{
 235             'id':          video_id,
 236             'url':         video_url,
 237             'ext':         'mp4',
 238             'title':       video_title,
 239             'description': video_description,
 240             'thumbnail':   thumbnail,
 241             'uploader_id': video_uploader_id,
 242         }]
 243
 244 class TeamcocoIE(InfoExtractor):
 245     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
 246
 247     def _real_extract(self, url):
 248         mobj = re.match(self._VALID_URL, url)
 249         if mobj is None:
 250             raise ExtractorError(u'Invalid URL: %s' % url)
 251         url_title = mobj.group('url_title')
 252         webpage = self._download_webpage(url, url_title)
 253
 254         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
 255             webpage, u'video id')
 256
 257         self.report_extraction(video_id)
 258
 259         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 260             webpage, u'title')
 261
 262         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
 263             webpage, u'thumbnail', fatal=False)
 264
 265         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
 266             webpage, u'description', fatal=False)
 267
 268         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
 269         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
 270
 271         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
 272             data, u'video URL')
 273
 274         return [{
 275             'id':          video_id,
 276             'url':         video_url,
 277             'ext':         'mp4',
 278             'title':       video_title,
 279             'thumbnail':   thumbnail,
 280             'description': video_description,
 281         }]
 282
 283 class XHamsterIE(InfoExtractor):
 284     """Information Extractor for xHamster"""
 285     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
 286
 287     def _real_extract(self,url):
 288         mobj = re.match(self._VALID_URL, url)
 289
 290         video_id = mobj.group('id')
 291         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
 292         webpage = self._download_webpage(mrss_url, video_id)
 293
 294         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
 295         if mobj is None:
 296             raise ExtractorError(u'Unable to extract media URL')
 297         if len(mobj.group('server')) == 0:
 298             video_url = compat_urllib_parse.unquote(mobj.group('file'))
 299         else:
 300             video_url = mobj.group('server')+'/key='+mobj.group('file')
 301         video_extension = video_url.split('.')[-1]
 302
 303         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
 304             webpage, u'title')
 305
 306         # Can't see the description anywhere in the UI
 307         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
 308         #     webpage, u'description', fatal=False)
 309         # if video_description: video_description = unescapeHTML(video_description)
 310
 311         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
 312         if mobj:
 313             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
 314         else:
 315             video_upload_date = None
 316             self._downloader.report_warning(u'Unable to extract upload date')
 317
 318         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
 319             webpage, u'uploader id', default=u'anonymous')
 320
 321         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
 322             webpage, u'thumbnail', fatal=False)
 323
 324         return [{
 325             'id':       video_id,
 326             'url':      video_url,
 327             'ext':      video_extension,
 328             'title':    video_title,
 329             # 'description': video_description,
 330             'upload_date': video_upload_date,
 331             'uploader_id': video_uploader_id,
 332             'thumbnail': video_thumbnail
 333         }]
 334
 335 class HypemIE(InfoExtractor):
 336     """Information Extractor for hypem"""
 337     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
 338
 339     def _real_extract(self, url):
 340         mobj = re.match(self._VALID_URL, url)
 341         if mobj is None:
 342             raise ExtractorError(u'Invalid URL: %s' % url)
 343         track_id = mobj.group(1)
 344
 345         data = { 'ax': 1, 'ts': time.time() }
 346         data_encoded = compat_urllib_parse.urlencode(data)
 347         complete_url = url + "?" + data_encoded
 348         request = compat_urllib_request.Request(complete_url)
 349         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
 350         cookie = urlh.headers.get('Set-Cookie', '')
 351
 352         self.report_extraction(track_id)
 353
 354         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
 355             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
 356         try:
 357             track_list = json.loads(html_tracks)
 358             track = track_list[u'tracks'][0]
 359         except ValueError:
 360             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 361
 362         key = track[u"key"]
 363         track_id = track[u"id"]
 364         artist = track[u"artist"]
 365         title = track[u"song"]
 366
 367         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
 368         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
 369         request.add_header('cookie', cookie)
 370         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
 371         try:
 372             song_data = json.loads(song_data_json)
 373         except ValueError:
 374             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 375         final_url = song_data[u"url"]
 376
 377         return [{
 378             'id':       track_id,
 379             'url':      final_url,
 380             'ext':      "mp3",
 381             'title':    title,
 382             'artist':   artist,
 383         }]
 384
 385
 386
 387 def gen_extractors():
 388     """ Return a list of an instance of every supported extractor.
 389     The order does matter; the first extractor matched is the one handling the URL.
 390     """
 391     return [
 392         YoutubePlaylistIE(),
 393         YoutubeChannelIE(),
 394         YoutubeUserIE(),
 395         YoutubeSearchIE(),
 396         YoutubeIE(),
 397         MetacafeIE(),
 398         DailymotionIE(),
 399         GoogleSearchIE(),
 400         PhotobucketIE(),
 401         YahooIE(),
 402         YahooSearchIE(),
 403         DepositFilesIE(),
 404         FacebookIE(),
 405         BlipTVIE(),
 406         BlipTVUserIE(),
 407         VimeoIE(),
 408         MyVideoIE(),
 409         ComedyCentralIE(),
 410         EscapistIE(),
 411         CollegeHumorIE(),
 412         XVideosIE(),
 413         SoundcloudSetIE(),
 414         SoundcloudIE(),
 415         InfoQIE(),
 416         MixcloudIE(),
 417         StanfordOpenClassroomIE(),
 418         MTVIE(),
 419         YoukuIE(),
 420         XNXXIE(),
 421         YouJizzIE(),
 422         PornotubeIE(),
 423         YouPornIE(),
 424         GooglePlusIE(),
 425         ArteTvIE(),
 426         NBAIE(),
 427         WorldStarHipHopIE(),
 428         JustinTVIE(),
 429         FunnyOrDieIE(),
 430         SteamIE(),
 431         UstreamIE(),
 432         RBMARadioIE(),
 433         EightTracksIE(),
 434         KeekIE(),
 435         TEDIE(),
 436         MySpassIE(),
 437         SpiegelIE(),
 438         LiveLeakIE(),
 439         ARDIE(),
 440         ZDFIE(),
 441         TumblrIE(),
 442         BandcampIE(),
 443         RedTubeIE(),
 444         InaIE(),
 445         HowcastIE(),
 446         VineIE(),
 447         FlickrIE(),
 448         TeamcocoIE(),
 449         XHamsterIE(),
 450         HypemIE(),
 451         Vbox7IE(),
 452         GametrailersIE(),
 453         StatigramIE(),
 454         GenericIE()
 455     ]
 456
 457 def get_info_extractor(ie_name):
 458     """Returns the info extractor class with the given ie_name"""
 459     return globals()[ie_name+'IE']