]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
Move XVideos IE into its own file (and simplify it a bit)
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.metacafe import MetacafeIE
35 from .extractor.myvideo import MyVideoIE
36 from .extractor.statigram import StatigramIE
37 from .extractor.photobucket import PhotobucketIE
38 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
39 from .extractor.vimeo import VimeoIE
40 from .extractor.xvideos import XVideosIE
41 from .extractor.yahoo import YahooIE, YahooSearchIE
42 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
43 from .extractor.zdf import ZDFIE
44
45
46
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The page embeds a base64-encoded, urlencoded path that points at
        # the RTMP stream; decode it to build the real video URL.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(
            base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')
        video_description = self._html_search_regex(
            r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Derive id and extension from the final path component of the URL.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension,  # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
89
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Returns the list of candidate URLs for *fmt*, picking the highest
        bitrate when none (or 'best') is requested, or when the requested
        bitrate is not offered.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """Print the available format/bitrate combinations to stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url.
        # BUG FIX: the groups are already text (str) on Python 3; the old
        # .decode('utf-8') calls raised AttributeError.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        file_url = None
        if req_format is None or req_format == 'best':
            # Take the first format whose URL list contains a live link.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param if format_param is not None else u'NA'),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
194
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches three URL shapes: a specific VideoPage (course + video), a
    # CoursePage (course only), and the site root / HomePage (neither).
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Extract a single video, or recursively expand a course/root page.

        Course and root pages are expanded by re-dispatching each linked
        page through self.extract(), so the result is a flat list of videos.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Each video has a sidecar XML file with its title and filename.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                # videoFile is relative to the course's videos/ directory.
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Collect each VideoPage link once, preserving page order.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Re-dispatch each video page through this extractor.
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Collect each CoursePage link once; each expands via the
            # course branch above.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
290
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # mtv_vt holds the song name, mtv_an the performing artist.
        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        # BUG FIX: 'performer' was used below but never assigned (NameError);
        # extract it from the mtv_an meta tag, as earlier versions did.
        performer = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'performer')

        video_title = performer
        if song_name:
            video_title = performer + u' - ' + song_name

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        # Guard before string concatenation: the two regexes above are
        # fatal=False and may return None (previously a TypeError here).
        if mtvn_uri is None or content_id is None:
            raise ExtractorError(u'Unable to extract mtvn_uri or content id')

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
351
352
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (segmented flv/mp4 downloads)."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Generate a pseudo-random session id: ms-timestamp + two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seed-shuffled alphabet used to decode the stream fileid.

        Deterministically permutes the source alphabet with a linear
        congruential generator seeded by the server-provided 'seed'.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # LCG step; floor(seed/65536 * len) picks the next char to take.
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated fileId indices through the mixed alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed =  config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested format to Youku's stream names:
            # hd2 (if offered) for best, mp4 for worst, flv otherwise.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8-9 (0-based) of the decoded fileid encode the segment
        # number; each segment URL replaces them with the hex segment index.
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
445
446
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        # Fetch the page; all metadata is embedded in the flash parameters.
        webpage = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded inside the player query string.
        encoded_url = self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(encoded_url)

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE, webpage, u'title')
        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE, webpage,
            u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
485
486
487
class NBAIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group(1)
        webpage = self._download_webpage(url, video_id)

        # The mp4 lives on Turner's CDN at a path derived from the page URL.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        # The last path component serves as the short id.
        shortened_video_id = video_id.rpartition('/')[2]

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id)
        title = title.replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(
            r'<meta name="description" (?:content|value)="(.*?)" />',
            webpage, 'description', fatal=False)

        return [{
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }]
521
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes: a channel (paged archive listing), a single
    # broadcast (/b/<id>), or a chapter of a broadcast (/c/<id>).
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips to info dicts.

        Returns (raw item count, list of entries that have a video URL);
        the raw count lets the caller detect the last page.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # Error responses come back as a dict with an 'error' key.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO-ish; keep YYYYMMDD only.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Channel: page through the whole archive below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # Chapter: resolve the parent archive, then download the whole
            # broadcast file (chapter-accurate cutting is a TODO below).
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Find the <archive> entry matching the page's archive id;
            # the for/else raises if none matches.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single broadcast: one API page, no paging.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
654
class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = match.group('id')
        webpage = self._download_webpage(url, video_id)

        # Pick the second <source> inside the <video> element.
        video_url = self._html_search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player heading; fall back to the page <title>.
        title_patterns = (
            r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>',
        )
        title = self._html_search_regex(title_patterns, webpage, 'title',
            flags=re.DOTALL)

        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }]
683
class SteamIE(InfoExtractor):
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a playlist of all trailers on a game's video page."""
        gameID = re.match(self._VALID_URL, url, re.VERBOSE).group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Age-gated titles redirect to a birth-date form; bypass it with a
        # canned date in the query string.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        # Movie URLs, their display names, and their thumbnails appear in
        # the same order on the page, so the three scans can be zipped.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'

        videos = []
        matched = zip(re.finditer(urlRE, webpage),
                      re.finditer(namesRE, webpage),
                      re.finditer(thumbsRE, webpage))
        for vid, vtitle, thumb in matched:
            video_id = vid.group('videoID')
            video_url = vid.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(vtitle.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
738
class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Build the CDN flv URL from the recorded-video id; scrape metadata."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The flv can be fetched directly from the CDN by id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'data-title="(?P<title>.+)"', webpage, u'title')
        uploader = self._html_search_regex(
            r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)
        thumbnail = self._html_search_regex(
            r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
770
class WorldStarHipHopIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        # The player URL is passed to the flash object via addVariable.
        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # Infer the container from the URL itself.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                video_title = candy_match.group(1)

        return [{
            'id': video_id,
            'url' : video_url,
            'title' : video_title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
810
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # The show metadata is a JSON blob assigned to window.gon.show.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps variant from the Akamai URL.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
844
845
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the format dict matching req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The age gate is bypassed with a cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:  # narrowed from a bare except
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as e:
            # BUG FIX: the old 'str + exc_info()[1]' concatenation raised
            # TypeError; format the missing key instead.
            raise ExtractorError(u'Missing JSON parameter: %s' % e)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 is "<size>_<bitrate>_<id>"; keep size/bitrate.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: this previously tested the undefined name 'result',
            # which raised NameError instead of the intended error.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
950
951
952
class PornotubeIE(InfoExtractor):
    """Extractor for pornotube.com videos.

    The video id and title are taken from the URL itself; the flv
    stream URL and (optional) upload date come from the page markup.
    """
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flv URL is embedded (percent-encoded) in the player config.
        video_url = compat_urllib_parse.unquote(self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url'))

        # Upload date is optional; normalize it when present.
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
987
class YouJizzIE(InfoExtractor):
    """Extractor for youjizz.com videos.

    The public page only carries the title; the actual media URL is
    read from a separate embed page referenced by the main page.
    """
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<title>(?P<title>.*)</title>', webpage, u'title').strip()

        # Locate the embed page; its numeric id supersedes the slug id.
        embed_match = re.search(
            r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the stream URL via addVariable().
        video_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
1028
class EightTracksIE(InfoExtractor):
    """Extractor for 8tracks.com mixes.

    A mix is a playlist of streamed tracks; each track URL is fetched
    one at a time from the play/next JSON API using a random session.
    """
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API requires a (random) session identifier.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        track_number = 0
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            # Request the next track of this mix.
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
1069
class KeekIE(InfoExtractor):
    """Extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail URLs follow a fixed CDN layout.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
1097
class TEDIE(InfoExtractor):
    """Extractor for ted.com talks and playlists.

    The URL decides the mode: /talks/<name> yields a single talk,
    /playlists/<id>/<name> yields an entry for every talk in the playlist.
    """
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written in re.VERBOSE mode and
        # the default suitable() would not pass that flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL type: a single talk or a whole playlist.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Two finditer passes over the same page: one collects the talk
        # metadata (<li> attributes), the other the talk page links; they
        # are assumed to appear in the same order and are zipped together.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Each talk is delegated back to this same IE via url_result().
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                       webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # NOTE: 'info' (the parsed talkDetails JSON) is rebound here to the
        # final result dict; the last htmlStreams entry is used as the URL.
        info = {
            'id': info['id'],
            'url': info['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': desc,
        }
        return info
1172
class MySpassIE(InfoExtractor):
    """Extractor for myspass.de videos.

    The video id is the last (or second-to-last, given a trailing slash)
    path element of the URL and is resolved through the site's XML
    metadata service.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # Fetch and parse the XML metadata document.
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # Mandatory fields: download URL and title.
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # Optional fields fall back to sensible defaults.
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: previously this branch read the undefined name 'ext'
            # (NameError); fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }]
1226
class SpiegelIE(InfoExtractor):
    """Extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', webpage, u'title')

        # Format data lives in a separate XML document; its last entry
        # describes the variant we download.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
1258
class LiveLeakIE(InfoExtractor):
    """Extractor for liveleak.com videos."""

    # BUG FIX: the scheme was written as 'http?' which matched 'htt://'
    # (optional final 'p') and rejected 'https://'; 'https?' is intended.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # The direct stream URL is assigned to the player's 'file' option.
        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # The og:title carries a site-name prefix that we strip off.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }]
1295
1296
1297
class TumblrIE(InfoExtractor):
    """Extractor for videos posted on tumblr.com blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        blog = match.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The video markup is embedded in an escaped (\xNN) javascript blob.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster frame as the thumbnail.
        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': ext
        }]
1331
class BandcampIE(InfoExtractor):
    """Extractor for free bandcamp.com tracks.

    Only tracks that expose a free download page can be extracted; the
    final mp3 URL is obtained through bandcamp's statdownload endpoint.
    """
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Renamed from 'id' to avoid shadowing the builtin.
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        return [{
            'id': track_id,
            'title': info[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': info[u'thumb_url'],
            'uploader': info[u'artist']
        }]
1377
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The page embeds a direct mp4 <source> element.
        video_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL')
        video_title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
1405
class InaIE(InfoExtractor):
    """Extractor for ina.fr videos (metadata via the player MRSS feed)."""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # All metadata comes from the player's MRSS document.
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
1432
class HowcastIE(InfoExtractor):
    """Extractor for howcast.com instructional videos."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Rebuild the canonical page URL from the numeric id.
        webpage = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
1466
class VineIE(InfoExtractor):
    """Extractor for vine.co clips."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Rebuild the canonical https page URL from the id.
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
1500
class FlickrIE(InfoExtractor):
    """Extractor for Flickr videos.

    Resolution is a two-step XML lookup: the page secret leads to a
    node id, which leads to a playlist containing the stream location.
    """
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')

        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-photo secret gates access to the video APIs.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The stream URL is APP + (HTML-unescaped) FULLPATH.
        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
1549
class TeamcocoIE(InfoExtractor):
    """Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id lives in a data attribute of the article tag.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # The downloadable file is listed in a per-video XML document.
        data = self._download_webpage('http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
1588
class XHamsterIE(InfoExtractor):
    """Extractor for xhamster.com movies."""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        video_id = re.match(self._VALID_URL, url).group('id')

        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config carries an optional server plus a file token;
        # without a server the file value is a percent-encoded direct URL.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # NOTE: no description is extracted — it is not visible in the UI.

        # The upload date only appears inside a tooltip hint attribute.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
1640
class HypemIE(InfoExtractor):
    """Information extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # The page is requested with a timestamped query string; the
        # Set-Cookie header from this response must be replayed on the
        # later 'serve' request, so the handle (urlh) is kept as well.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # Track metadata is embedded in the page as a JSON <script> block.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            # Only the first track of the list is extracted.
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        # NOTE: track_id is rebound here from the URL slug to the JSON id.
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # Resolve the real stream URL via the serve endpoint, passing the
        # session cookie captured above.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
1690
class Vbox7IE(InfoExtractor):
    """Extractor for vbox7.com videos."""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The page issues a javascript redirect; follow it manually.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # The media URL comes from a POST to the "magare" endpoint, which
        # answers with 'key=value&key=value' pairs.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id': video_id,
            'url': final_url,
            'ext': "flv",
            'title': title,
            'thumbnail': thumbnail_url,
        }]
1726
1727
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep the classes in matching priority order and instantiate them
    # in a single pass at the end.
    extractor_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    ]
    return [klass() for klass in extractor_classes]
1797
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention and live
    # at module level, so a globals() lookup resolves them.
    return globals()['%sIE' % ie_name]