]> jfr.im git - yt-dlp.git/blame - youtube_dl/InfoExtractors.py
Move FunnyOrDie into its own file
[yt-dlp.git] / youtube_dl / InfoExtractors.py
CommitLineData
4fcca4bb 1import base64
d77c3dfd 2import datetime
ccf65f9d 3import itertools
d77c3dfd
FV
4import netrc
5import os
6import re
7import socket
8import time
d77c3dfd 9import email.utils
921a1455 10import xml.etree.ElementTree
302efc19 11import random
12import math
6324fd1d 13import operator
de5d66d4 14import hashlib
15import binascii
16import urllib
d77c3dfd 17
9e8056d5 18from .utils import *
d6983cb4 19from .extractor.common import InfoExtractor, SearchInfoExtractor
d5822b96
PH
20
21from .extractor.ard import ARDIE
22from .extractor.arte import ArteTvIE
f5884801 23from .extractor.bliptv import BlipTVIE, BlipTVUserIE
ea63e499 24from .extractor.comedycentral import ComedyCentralIE
7beb36a5 25from .extractor.collegehumor import CollegeHumorIE
219b8130 26from .extractor.dailymotion import DailymotionIE
426ff042 27from .extractor.depositfiles import DepositFilesIE
15369766 28from .extractor.escapist import EscapistIE
a50e1b32 29from .extractor.facebook import FacebookIE
8f0578f0 30from .extractor.funnyordie import FunnyOrDieIE
9f4e6bba 31from .extractor.gametrailers import GametrailersIE
9b122384 32from .extractor.generic import GenericIE
7aca14a1
PH
33from .extractor.googleplus import GooglePlusIE
34from .extractor.googlesearch import GoogleSearchIE
fda7d31a 35from .extractor.infoq import InfoQIE
38cbc40a 36from .extractor.metacafe import MetacafeIE
80cbb6dd 37from .extractor.mixcloud import MixcloudIE
33505666 38from .extractor.mtv import MTVIE
a08dfd27 39from .extractor.myvideo import MyVideoIE
5b286728 40from .extractor.nba import NBAIE
38cbc40a 41from .extractor.statigram import StatigramIE
97d6faac 42from .extractor.photobucket import PhotobucketIE
aad0d6d5 43from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
291a168b 44from .extractor.stanfordoc import StanfordOpenClassroomIE
462dc88b 45from .extractor.steam import SteamIE
9fd5ce0c 46from .extractor.ted import TEDIE
b3d14cbf 47from .extractor.vimeo import VimeoIE
250f5578 48from .extractor.worldstarhiphop import WorldStarHipHopIE
462dc88b 49from .extractor.xnxx import XNXXIE
cbf46c73 50from .extractor.xvideos import XVideosIE
934858ad 51from .extractor.yahoo import YahooIE, YahooSearchIE
9c286cfa 52from .extractor.youku import YoukuIE
b05654f0 53from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
d5822b96 54from .extractor.zdf import ZDFIE
e30e9318 55
d830b7c2 56
d77c3dfd 57
d77c3dfd 58
d77c3dfd 59
6de7ef9b 60
302efc19 61
5dc846fa
FV
62
63
fd873c69
FV
64
65
4cc3d074 66
0b40544f
DV
67
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Verbose regex: exactly one of channelid / videoid / chapterid is set.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    _API_BASE = 'http://api.justin.tv'
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                       (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips to info dicts.

        Returns a tuple ``(count, info)`` where ``count`` is the number of
        items in the API response and ``info`` lists only the *valid* items
        (those with a non-empty 'video_file_url').

        Raises ExtractorError when the API answers with something other
        than a JSON list.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if not isinstance(response, list):
            # A non-list answer is treated as an error report; only a dict
            # can carry an 'error' key.
            if isinstance(response, dict):
                error_text = response.get('error', 'unknown error')
            else:
                error_text = 'unknown error'
            raise ExtractorError(u'Justin.tv API: %s' % error_text)

        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if not video_url:
                continue
            clip_id = clip['id']
            uploader_id = clip.get('user_id', clip.get('channel_id'))
            info.append({
                'id': clip_id,
                'url': video_url,
                'title': clip.get('title', clip_id),
                'uploader': clip.get('channel_name', uploader_id),
                'uploader_id': uploader_id,
                # The first 10 characters of 'start_time' with the dashes
                # removed; presumably YYYY-MM-DD -> YYYYMMDD (TODO confirm).
                'upload_date': clip['start_time'][:10].replace('-', ''),
                'ext': os.path.splitext(video_url)[1][1:],
            })
        return (len(response), info)

    def _extract_chapter(self, url, chapter_id):
        """Return a one-element info list for a chapter ('/c/<id>') URL."""
        webpage = self._download_webpage(url, chapter_id)
        m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
        if not m:
            raise ExtractorError(u'Cannot find archive of a chapter')
        archive_id = m.group(1)

        api = self._API_BASE + '/broadcast/by_chapter/%s.xml' % chapter_id
        chapter_info_xml = self._download_webpage(
            api, chapter_id,
            note=u'Downloading chapter information',
            errnote=u'Chapter information download failed')
        doc = xml.etree.ElementTree.fromstring(chapter_info_xml)

        # Pick the <archive> element whose <id> matches the page's archive id.
        archive = None
        for candidate in doc.findall('.//archive'):
            if archive_id == candidate.find('./id').text:
                archive = candidate
                break
        if archive is None:
            raise ExtractorError(u'Could not find chapter in chapter information')

        video_url = archive.find('./video_file_url').text
        video_ext = video_url.rpartition('.')[2] or u'flv'

        chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
        chapter_info_json = self._download_webpage(
            chapter_api_url, u'c' + chapter_id,
            note='Downloading chapter metadata',
            errnote='Download of chapter metadata failed')
        chapter_info = json.loads(chapter_info_json)

        bracket_start = int(doc.find('.//bracket_start').text)
        bracket_end = int(doc.find('.//bracket_end').text)

        # TODO determine start (and probably fix up file)
        # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
        #video_url += u'?start=' + TODO:start_timestamp
        # bracket_start is 13290, but we want 51670615
        self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                        u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

        info = {
            'id': u'c' + chapter_id,
            'url': video_url,
            'ext': video_ext,
            'title': chapter_info['title'],
            'thumbnail': chapter_info['preview'],
            'description': chapter_info['description'],
            'uploader': chapter_info['channel']['display_name'],
            'uploader_id': chapter_info['channel']['name'],
        }
        return [info]

    def _real_extract(self, url):
        """Return a list of info dicts for a channel, broadcast or chapter URL.

        Channel URLs are fetched page by page (_JUSTIN_PAGE_LIMIT items per
        request) until a page comes back with a different item count than
        the limit; broadcast URLs need a single request.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        paged = False
        if mobj.group('channelid'):
            paged = True
            video_id = mobj.group('channelid')
            api = self._API_BASE + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            return self._extract_chapter(url, mobj.group('chapterid'))
        else:
            video_id = mobj.group('videoid')
            api = self._API_BASE + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
21a9c6aa 200
d0d4f277 201
ef0c8d5f 202
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv"""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Return a single info dict (not a list) for the recording at url."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Only the title is mandatory; uploader and thumbnail are best-effort.
        title = self._html_search_regex(
            r'data-title="(?P<title>.+)"', page, u'title')
        channel = self._html_search_regex(
            r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            page, u'uploader', fatal=False, flags=re.DOTALL)
        thumb = self._html_search_regex(
            r'<link rel="image_src" href="(?P<thumb>.*?)"',
            page, u'thumbnail', fatal=False)

        return {
            'id': video_id,
            # The media URL is built from the id alone, not read from the page.
            'url': u'http://tcdn.ustream.tv/video/%s' % video_id,
            'ext': 'flv',
            'title': title,
            'uploader': channel,
            'thumbnail': thumb,
        }
4aeae91f 234
40634747 235
ca0a0bbe
PH
class RBMARadioIE(InfoExtractor):
    """Information extractor for shows on rbmaradio.com"""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the show at url."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        page = self._download_webpage(url, video_id)

        # The show metadata is embedded in the page as a JS assignment.
        raw_show = self._search_regex(
            r'window\.gon.*?gon\.show=(.+?);$',
            page, u'json data', flags=re.MULTILINE)
        try:
            show = json.loads(raw_show)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON: ' + str(err))

        media_url = show['akamai_url'] + '&cbr=256'
        # Take the extension from the URL path so the query string is ignored.
        media_path = compat_urllib_parse_urlparse(media_url).path

        return [{
            'id': video_id,
            'url': media_url,
            'ext': media_path.rpartition('.')[2],
            'title': show['title'],
            'description': show.get('teaser_text'),
            'location': show.get('country_of_origin'),
            'uploader': show.get('host', {}).get('name'),
            'uploader_id': show.get('host', {}).get('slug'),
            'thumbnail': show.get('image', {}).get('large_url_2x'),
            'duration': show.get('duration'),
        }]
4aeae91f 269
991ba7fa
JC
270
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for fmt in formats:
            print(u'%s\t\t%s' % (fmt['ext'], fmt['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals req_format, or None."""
        for candidate in formats:
            if candidate["format"] == req_format:
                return candidate
        return None

    def _real_extract(self, url):
        """Return the info dicts selected by the 'format' downloader param.

        With the 'listformats' param set, the formats are printed and None
        is returned. Raises ExtractorError for an invalid URL, unparsable
        or incomplete page metadata, an empty download list, or an
        unavailable requested format.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The page is requested with the age-verification cookie already set.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as err:
            # The exception is formatted into the message; it cannot be
            # concatenated to a string directly.
            raise ExtractorError(u'Missing JSON parameter: %s' % err)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
                                                webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # Fifth path component is "<size>_<bitrate>_<id>"; the format
            # name is "<size>-<bitrate>".
            format_name = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format_name,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # 'best' and 'worst' take the first and last link in page order.
        if req_format is None or req_format == 'best':
            return [formats[0]]
        if req_format == 'worst':
            return [formats[-1]]
        if req_format in ('-1', 'all'):
            return formats

        selected = self._specific(req_format, formats)
        if selected is None:
            raise ExtractorError(u'Requested format not available')
        return [selected]
375
6324fd1d 376
991ba7fa
JC
377
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video at url."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the id and the title are taken from the URL itself.
        video_id, title = match.group('videoid', 'title')

        page = self._download_webpage(url, video_id)

        # The video URL on the page is percent-encoded.
        media_url = compat_urllib_parse.unquote(self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            page, u'video url'))

        # The upload date is optional; normalise it only when it was found.
        raw_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            page, u'upload date', fatal=False)
        upload_date = unified_strdate(raw_date) if raw_date else raw_date

        return [{
            'id': video_id,
            'url': media_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': title,
            'ext': 'flv',
            'format': 'flv',
        }]
412
991ba7fa
JC
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video at url."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # First page: the watch page, which carries the title.
        page = self._download_webpage(url, url_match.group('videoid'))
        title = self._html_search_regex(
            r'<title>(?P<title>.*)</title>', page, u'title').strip()

        # Second page: the embed page linked from the watch page. Its
        # numeric id replaces the id taken from the URL.
        embed_match = re.search(
            r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', page)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        embed_page = self._download_webpage(embed_url, video_id)
        media_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            embed_page, u'video URL')

        return [{
            'id': video_id,
            'url': media_url,
            'title': title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_url,
        }]
453
ccf65f9d
PH
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes"""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return one info dict per track of the mix at url.

        Tracks are requested one at a time until the API reports
        'at_last_track'.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group('id')

        page = self._download_webpage(url, playlist_id)
        mix = json.loads(self._search_regex(
            r"PAGE.mix = (.*?);\n", page, u'trax information', flags=re.DOTALL))

        # A random number is used as the play-session token in the API URLs.
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        total = mix['tracks_count']

        request_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        tracks = []
        position = 0
        while True:
            position += 1
            reply = json.loads(self._download_webpage(
                request_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(position), total),
                errnote=u'Failed to download song information'))
            track = reply[u'set']['track']
            tracks.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            if reply['set']['at_last_track']:
                return tracks
            # The next request names the track that was just received.
            request_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
991ba7fa 494
da06e2da
OK
class KeekIE(InfoExtractor):
    """Information extractor for keek.com"""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the keek at url."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The page is only needed for the title and the optional uploader;
        # media and thumbnail URLs are built from the id.
        page = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            page, u'title')
        author = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            page, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': title,
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': author,
        }]
522
da06e2da 523
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de"""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video at url.

        Raises ExtractorError when the metadata XML lacks the download URL
        or the title.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        def optional_text(tag):
            """Return the text of child element *tag*, or None if absent."""
            element = metadata.find(tag)
            if element is None:
                return None
            return element.text

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # Fall back to the file extension when no explicit format id exists.
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            video_format = extension
        else:
            video_format = format_id_el.text

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': optional_text('imagePreview'),
            'description': optional_text('description'),
        }
        return [info]
577
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos"""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video at url."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)
        title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', page, u'title')

        # File name and duration come from a per-video XML document.
        xml_code = self._download_webpage(
            u'http://video2.spiegel.de/flash/' + video_id + u'.xml', video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')
        variants = xml.etree.ElementTree.fromstring(xml_code)

        # The last child element of the document is the one that is used.
        chosen = variants[-1]
        filename = chosen.findall('./filename')[0].text
        duration = float(chosen.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': title,
            'duration': duration,
        }]
609
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com"""

    # The optional scheme is "https?://": the previous "http?://" made the
    # "p" optional rather than the "s", so https URLs never matched.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video at url.

        Raises ExtractorError for a URL that does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
                                       webpage, u'video URL')

        # The og:title value has the site prefix stripped.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
                                              webpage, u'title').replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional.
        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
                                                    webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
                                                 webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
646
f2cd958c 647
f2cd958c 648
class TumblrIE(InfoExtractor):
    """Information extractor for videos in tumblr.com posts"""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the post's video."""
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        blog = url_match.group('blog_name')

        # The post page is always the one fetched, whatever URL form was given.
        post_url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        page = self._download_webpage(post_url, video_id)

        # The pattern matches literal "\x22" sequences around the attributes.
        source_re = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        source = re.search(source_re, page)
        if source is None:
            raise ExtractorError(u'Unable to extract video')

        # We pick the first poster
        poster = self._search_regex(
            r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            page, u'thumbnail', fatal=False)
        if poster:
            poster = poster.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        title = self._html_search_regex(
            r'<title>(?P<title>.*?)</title>',
            page, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': source.group('video_url'),
            'title': title,
            'thumbnail': poster,
            'ext': source.group('ext'),
        }]
682
class BandcampIE(InfoExtractor):
    """Information extractor for free tracks on bandcamp.com"""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the track at url.

        Raises ExtractorError when the URL is invalid, the track has no free
        download page, or one of the expected page fragments is missing.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)

        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE | re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')

        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        info = json.loads(m_items.group(1))[0]

        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unexpected download url format')

        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')

        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract final track url')
        final_url = m_final.group(1)

        track_info = {
            'id': track_id,
            'title': info[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': info[u'thumb_url'],
            'uploader': info[u'artist'],
        }

        return [track_info]
728
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video at url."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group('id')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Both fields are mandatory: neither search is marked non-fatal.
        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', page, u'video URL')
        title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            page, u'title')

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }]
7f5bd09b 756
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video at url."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # Metadata is read from the player's MRSS feed, not the watch page.
        feed = self._download_webpage(
            'http://player.ina.fr/notices/%s.mrss' % video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            feed, u'video URL')
        title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            feed, u'title')

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }]
e32b06e9 783
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video at url."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # The canonical page URL is rebuilt from the id before downloading.
        page = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')

        # Description and thumbnail are optional.
        description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)
        thumb = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumb,
        }]
817
5b0d3cc0
AB
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the vine at url."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # The canonical https URL is rebuilt from the id before downloading.
        page = self._download_webpage('https://vine.co/v/' + video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')

        # Thumbnail and uploader are optional.
        thumb = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        author = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb,
            'uploader': author,
        }]
851
afef36c9
AB
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Resolve a Flickr photo-page URL to its video stream.

        Three downloads are made in sequence: the photo page (for the
        secret and the og: metadata), a first XML document (for the node
        id) and a second XML document (for the stream location).

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError when the stream element cannot be found.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        uploader_id = url_match.group('uploader_id')

        page_url = 'http://www.flickr.com/photos/' + uploader_id + '/' + video_id
        page = self._download_webpage(page_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        # Step 1: the secret gives access to the XML carrying the node id.
        mtl_url = (
            'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id='
            + video_id + '&secret=' + secret + '&bitrate=700&target=_self')
        mtl_xml = self._download_webpage(
            mtl_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>',
            mtl_xml, u'node_id')

        # Step 2: the node id gives access to the playlist XML.
        playlist_url = (
            'https://secure.flickr.com/video_playlist.gne?node_id='
            + node_id + '&tech=flash&mode=playlist&bitrate=700&secret='
            + secret + '&rd=video.yahoo.com&noad=1')
        playlist_xml = self._download_webpage(
            playlist_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', playlist_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # Only the FULLPATH part is passed through unescapeHTML.
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        # The remaining metadata comes from the og: tags of the photo page;
        # content may be quoted with either " or '.
        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')

        description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)

        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail_url,
            'uploader_id': uploader_id,
        }
        return [info]
900
45014296
JMF
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Scrape a teamcoco.com video page and its companion XML document.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError when *url* does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The numeric id is not part of the URL; the slug is used as the
        # identifier for the first download and the real id is read from
        # the page afterwards.
        slug = url_match.group('url_title')
        page = self._download_webpage(url, slug)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            page, u'video id')

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')

        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)

        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        # The media location lives in a separate XML document keyed by id.
        cvp_xml = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')

        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>',
            cvp_xml, u'video URL')

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail_url,
            'description': description,
        }
        return [info]
84095012 939
71e458d4
YUK
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so that only a literal "www." prefix
    # is accepted (an unescaped dot would match any character).
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Scrape an xHamster movie page.

        Returns a one-element list holding the info dictionary
        (id, url, ext, title, upload_date, uploader_id, thumbnail).
        Raises ExtractorError when *url* does not match _VALID_URL or when
        the media location cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        server = mobj.group('server')
        if not server:
            # With an empty server, 'file' is itself a percent-encoded URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = server + '/key=' + mobj.group('file')
        # The extension is whatever follows the last dot of the media URL.
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # NOTE: no description is extracted; it does not appear to be shown
        # anywhere in the site's UI.

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Concatenate year, month and day into a YYYYMMDD string.
            video_upload_date = ''.join(
                mobj.group('upload_date_Y', 'upload_date_m', 'upload_date_d'))
        else:
            # A missing date is only worth a warning, not a failure.
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail,
        }]
afef36c9 991
157b864a
YK
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Resolve a hypem track page to the location of its audio file.

        The track page embeds a JSON track list; the first track's id and
        key are then used to request a second JSON document that carries
        the final URL. The Set-Cookie header of the first response is sent
        back with the second request.

        Returns a one-element list holding the info dictionary
        (id, url, ext, title, artist).
        Raises ExtractorError when *url* does not match _VALID_URL, when
        either JSON document cannot be parsed, or when an expected field is
        missing from the parsed data.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        # Do not start a second query string if the URL already has one.
        separator = '&' if '?' in url else '?'
        complete_url = url + separator + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        # An empty track list or a missing field is reported as an
        # extraction failure instead of leaking IndexError/KeyError.
        try:
            track = track_list[u'tracks'][0]
            key = track[u"key"]
            track_id = track[u"id"]
            artist = track[u"artist"]
            title = track[u"song"]
        except (KeyError, IndexError, TypeError):
            raise ExtractorError(u'Hypemachine track data is missing expected fields.')

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        try:
            final_url = song_data[u"url"]
        except (KeyError, TypeError):
            raise ExtractorError(u'Hypemachine metadata did not contain a media URL.')

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
1041
ecb3e676
YK
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Resolve a Vbox7 play URL to its media and thumbnail locations.

        The first page only contains a script redirect; the target page
        supplies the title, and a form-encoded request to magare.do
        supplies the media and thumbnail URLs.

        Returns a one-element list holding the info dictionary
        (id, url, ext, title, thumbnail).
        Raises ExtractorError when *url* does not match _VALID_URL or when
        the info response cannot be split into exactly two key=value pairs.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        # The redirect target is appended to the URL that was actually fetched.
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Only the part of <title> before the first '/' is kept.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')

        # The response is '&'-separated key=value pairs, media URL first and
        # thumbnail second. Split on the FIRST '=' only, so that values
        # which themselves contain '=' (e.g. URLs with query parameters)
        # are kept whole.
        try:
            final_url, thumbnail_url = [
                pair.split('=', 1)[1] for pair in info_response.split('&')]
        except (IndexError, ValueError):
            # A pair without '=' or an unexpected number of pairs.
            raise ExtractorError(u'Unable to extract the media url')

        return [{
            'id': video_id,
            'url': final_url,
            'ext': ext,
            'title': title,
            'thumbnail': thumbnail_url,
        }]
157b864a 1077
32aa88bc 1078
4aeae91f
PH
def gen_extractors():
    """Return a list with one fresh instance of every supported extractor.

    Order matters: the first extractor that matches a URL is the one that
    handles it.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    # Instantiate in declaration order so the priority order is preserved.
    return [extractor_class() for extractor_class in extractor_classes]
93412126
JMF
1148
def get_info_extractor(ie_name):
    """Look up an info extractor class in this module by its short name.

    'IE' is appended to *ie_name* and the result is used as a key into the
    module's global namespace; a KeyError is raised when no such global
    exists.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]