]> jfr.im git - yt-dlp.git/blame - youtube_dl/InfoExtractors.py
[myspass] Move into own file and default to mp4 ext
[yt-dlp.git] / youtube_dl / InfoExtractors.py
CommitLineData
4fcca4bb 1import base64
d77c3dfd 2import datetime
ccf65f9d 3import itertools
d77c3dfd
FV
4import netrc
5import os
6import re
7import socket
8import time
d77c3dfd 9import email.utils
921a1455 10import xml.etree.ElementTree
302efc19 11import random
12import math
6324fd1d 13import operator
de5d66d4 14import hashlib
15import binascii
16import urllib
d77c3dfd 17
9e8056d5 18from .utils import *
d6983cb4 19from .extractor.common import InfoExtractor, SearchInfoExtractor
d5822b96
PH
20
21from .extractor.ard import ARDIE
22from .extractor.arte import ArteTvIE
f5884801 23from .extractor.bliptv import BlipTVIE, BlipTVUserIE
ea63e499 24from .extractor.comedycentral import ComedyCentralIE
7beb36a5 25from .extractor.collegehumor import CollegeHumorIE
219b8130 26from .extractor.dailymotion import DailymotionIE
426ff042 27from .extractor.depositfiles import DepositFilesIE
82840042 28from .extractor.eighttracks import EightTracksIE
15369766 29from .extractor.escapist import EscapistIE
a50e1b32 30from .extractor.facebook import FacebookIE
8f0578f0 31from .extractor.funnyordie import FunnyOrDieIE
9f4e6bba 32from .extractor.gametrailers import GametrailersIE
9b122384 33from .extractor.generic import GenericIE
7aca14a1
PH
34from .extractor.googleplus import GooglePlusIE
35from .extractor.googlesearch import GoogleSearchIE
fda7d31a 36from .extractor.infoq import InfoQIE
79e93125 37from .extractor.justintv import JustinTVIE
2c64df03 38from .extractor.keek import KeekIE
38cbc40a 39from .extractor.metacafe import MetacafeIE
80cbb6dd 40from .extractor.mixcloud import MixcloudIE
33505666 41from .extractor.mtv import MTVIE
97d2db01 42from .extractor.myspass import MySpassIE
a08dfd27 43from .extractor.myvideo import MyVideoIE
5b286728 44from .extractor.nba import NBAIE
38cbc40a 45from .extractor.statigram import StatigramIE
97d6faac 46from .extractor.photobucket import PhotobucketIE
1183b85f 47from .extractor.pornotube import PornotubeIE
e10e576f 48from .extractor.rbmaradio import RBMARadioIE
aad0d6d5 49from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
291a168b 50from .extractor.stanfordoc import StanfordOpenClassroomIE
462dc88b 51from .extractor.steam import SteamIE
9fd5ce0c 52from .extractor.ted import TEDIE
78af8eb1 53from .extractor.ustream import UstreamIE
b3d14cbf 54from .extractor.vimeo import VimeoIE
250f5578 55from .extractor.worldstarhiphop import WorldStarHipHopIE
462dc88b 56from .extractor.xnxx import XNXXIE
cbf46c73 57from .extractor.xvideos import XVideosIE
934858ad 58from .extractor.yahoo import YahooIE, YahooSearchIE
c3c77cec 59from .extractor.youjizz import YouJizzIE
9c286cfa 60from .extractor.youku import YoukuIE
0143dc02 61from .extractor.youporn import YouPornIE
b05654f0 62from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
d5822b96 63from .extractor.zdf import ZDFIE
e30e9318 64
d830b7c2 65
d77c3dfd 66
d77c3dfd 67
d77c3dfd 68
6de7ef9b 69
302efc19 70
5dc846fa
FV
71
72
fd873c69
FV
73
74
4cc3d074 75
0b40544f 76
21a9c6aa 77
d0d4f277 78
ef0c8d5f 79
4aeae91f 80
40634747 81
4aeae91f 82
991ba7fa 83
991ba7fa 84
6324fd1d 85
991ba7fa 86
991ba7fa 87
991ba7fa
JC
88
89
da06e2da 90
da06e2da 91
1ad5d872 92
class SpiegelIE(InfoExtractor):
    """Information Extractor for spiegel.de video pages"""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)
        title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', page, u'title')

        # The media metadata lives in a separate per-video XML document.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(
            xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        # Only the last child element of the document root is used.
        variant = xml.etree.ElementTree.fromstring(xml_code)[-1]
        filename = variant.findall('./filename')[0].text
        duration = float(variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            # Extension is whatever follows the last dot of the file name.
            'ext': filename.rpartition('.')[2],
            'title': title,
            'duration': duration,
        }]
124
class LiveLeakIE(InfoExtractor):
    """Information Extractor for liveleak.com "view" pages"""

    # The scheme used to be written ``http?://``, which makes only the final
    # "p" optional (matching "htt://" and "http://") and therefore rejected
    # every "https://" URL.  ``https?://`` accepts both real schemes.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError if `url` does not match _VALID_URL.  The
        description and uploader lookups are requested with fatal=False.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Drop the site prefix from the Open Graph title.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader,
        }

        return [info]
161
f2cd958c 162
f2cd958c 163
class TumblrIE(InfoExtractor):
    """Information Extractor for videos embedded in tumblr.com posts"""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError if no video source can be found in the page.
        """
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Always fetch the canonical /post/ page, whichever form was given.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The blog name comes straight from the URL, so it is escaped before
        # being interpolated into a pattern (it may contain dots or other
        # regex metacharacters).
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (re.escape(blog), re.escape(video_id))
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster.  The text between "posters" and the first
        # quoted URL is matched WITHOUT a capture group so that the poster URL
        # is the only group in the pattern.
        # NOTE(review): presumably _search_regex returns the first matched
        # group, in which case the old extra group was returned instead of the
        # thumbnail -- confirm against InfoExtractor._search_regex.
        video_thumbnail = self._search_regex(r'posters.*?\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            # Strip the backslashes left over from the JavaScript escaping.
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': ext,
        }]
197
class BandcampIE(InfoExtractor):
    """Information Extractor for Bandcamp tracks offered as free downloads"""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError if `url` does not match _VALID_URL, if the page
        has no free download link, or if any intermediate page lacks the data
        this extractor looks for.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)

        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track info')
        info = json.loads(m_items.group(1))[0]

        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unable to parse download url')

        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract final download url')
        final_url = m_final.group(1)

        track_info = {
            'id': track_id,
            'title': info[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': info[u'thumb_url'],
            'uploader': info[u'artist'],
        }

        return [track_info]
243
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError if `url` does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('id')
        extension = 'mp4'
        page = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', page, u'video URL')
        title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', page, u'title')

        result = {
            'id': video_id,
            'url': media_url,
            'ext': extension,
            'title': title,
        }
        return [result]
7f5bd09b 271
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # Metadata is read from the player's MRSS feed, not from the page.
        feed_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        extension = 'mp4'
        feed = self._download_webpage(feed_url, video_id)

        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            feed, u'video URL')
        title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            feed, u'title')

        result = {
            'id': video_id,
            'url': media_url,
            'ext': extension,
            'title': title,
        }
        return [result]
e32b06e9 298
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Description and thumbnail are looked up with fatal=False.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        # Rebuild the page address from the id alone.
        page_url = 'http://www.howcast.com/videos/' + video_id
        page = self._download_webpage(page_url, video_id)

        self.report_extraction(video_id)

        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')
        description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)
        poster = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': poster,
        }
        return [result]
332
5b0d3cc0
AB
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Thumbnail and uploader are looked up with fatal=False.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        # Rebuild the page address from the id alone.
        page_url = 'https://vine.co/v/' + video_id
        page = self._download_webpage(page_url, video_id)

        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        poster = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        owner = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': poster,
            'uploader': owner,
        }
        return [result]
366
afef36c9
AB
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError if the stream element is missing from the
        second data document.  Description and thumbnail use fatal=False.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        uploader_id = url_match.group('uploader_id')

        page_url = 'http://www.flickr.com/photos/' + uploader_id + '/' + video_id
        page = self._download_webpage(page_url, video_id)

        # The secret taken from the page is needed by both data requests below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        # Step 1: fetch the document that carries the node id.
        first_url = ('https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id='
                     + video_id + '&secret=' + secret + '&bitrate=700&target=_self')
        first_doc = self._download_webpage(
            first_url, video_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_doc, u'node_id')

        # Step 2: fetch the playlist document that carries the stream location.
        second_url = ('https://secure.flickr.com/video_playlist.gne?node_id='
                      + node_id + '&tech=flash&mode=playlist&bitrate=700&secret='
                      + secret + '&rd=video.yahoo.com&noad=1')
        second_doc = self._download_webpage(
            second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_doc)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # APP is used as-is; FULLPATH is HTML-unescaped before being appended.
        media_url = stream.group(1) + unescapeHTML(stream.group(2))

        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')
        description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)
        poster = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': poster,
            'uploader_id': uploader_id,
        }
        return [result]
415
45014296
JMF
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError if `url` does not match _VALID_URL.
        Thumbnail and description are looked up with fatal=False.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        slug = url_match.group('url_title')
        page = self._download_webpage(url, slug)

        # The numeric id only appears in the page markup, not in the URL.
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', page, u'video id')

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', page, u'title')
        poster = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        # The media location is listed in a separate per-video XML document.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': poster,
            'description': description,
        }
        return [result]
84095012 454
71e458d4
YUK
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped; unescaped it matched any character
    # (e.g. "wwwxxhamster.com").
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError if the media location cannot be found.  A
        missing upload date only produces a warning; the uploader id falls
        back to u'anonymous' and the thumbnail lookup uses fatal=False.
        """
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # No server given: 'file' is itself a percent-encoded URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # The description is not extracted: it can't be seen anywhere in the UI.

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Concatenate year, month and day into a YYYYMMDD string.
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail,
        }]
afef36c9 506
157b864a
YK
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError if `url` does not match _VALID_URL or if either
        downloaded JSON payload cannot be decoded.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = url_match.group(1)

        query = compat_urllib_parse.urlencode({ 'ax': 1, 'ts': time.time() })
        page_request = compat_urllib_request.Request(url + "?" + query)
        response, urlh = self._download_webpage_handle(
            page_request, track_id, u'Downloading webpage with the url')
        # The cookie set by this response is replayed on the metadata request.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        tracks_json = self._html_search_regex(
            r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track = json.loads(tracks_json)[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        # From here on the id reported by the page replaces the one from the URL.
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        meta_request = compat_urllib_request.Request(
            serve_url, "" , {'Content-Type': 'application/json'})
        meta_request.add_header('cookie', cookie)
        meta_json = self._download_webpage(meta_request, track_id, u'Downloading metadata')
        try:
            meta = json.loads(meta_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        result = {
            'id': track_id,
            'url': meta[u"url"],
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }
        return [result]
556
ecb3e676
YK
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError if `url` does not match _VALID_URL or if the
        info response cannot be split into exactly two key=value fields.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The first page only contains a JavaScript redirect; follow it by hand.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Only the part of the <title> before the first '/' is kept.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        try:
            # Split each field on its FIRST '=' only, so a value that itself
            # contains '=' (such as a URL with a query string) is kept whole
            # instead of being cut off at the second '='.
            final_url, thumbnail_url = [
                field.split('=', 1)[1] for field in info_response.split('&')]
        except (IndexError, ValueError):
            # A field without '=' or a field count other than two.
            raise ExtractorError(u'Unable to extract the media url')

        return [{
            'id': video_id,
            'url': final_url,
            'ext': ext,
            'title': title,
            'thumbnail': thumbnail_url,
        }]
157b864a 592
32aa88bc 593
4aeae91f
PH
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Classes are listed in matching priority order and instantiated in turn.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [extractor_class() for extractor_class in extractor_classes]
93412126
JMF
663
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # The class is looked up among this module's globals under ie_name + 'IE';
    # an unknown name raises KeyError.
    class_name = ie_name + 'IE'
    return globals()[class_name]