import base64
import datetime
import itertools
import netrc
import os
import re
import socket
import time
import email.utils
import xml.etree.ElementTree
import random
import math
import operator
import hashlib
import binascii
import urllib

from .utils import *
from .extractor.common import InfoExtractor, SearchInfoExtractor

from .extractor.ard import ARDIE
from .extractor.arte import ArteTvIE
from .extractor.bliptv import BlipTVIE, BlipTVUserIE
from .extractor.comedycentral import ComedyCentralIE
from .extractor.dailymotion import DailymotionIE
from .extractor.gametrailers import GametrailersIE
from .extractor.generic import GenericIE
from .extractor.googleplus import GooglePlusIE
from .extractor.googlesearch import GoogleSearchIE
from .extractor.metacafe import MetacafeIE
from .extractor.myvideo import MyVideoIE
from .extractor.statigram import StatigramIE
from .extractor.photobucket import PhotobucketIE
from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
from .extractor.vimeo import VimeoIE
from .extractor.yahoo import YahooIE, YahooSearchIE
from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
from .extractor.zdf import ZDFIE


class DepositFilesIE(InfoExtractor):
59 """Information extractor for depositfiles.com"""
60
61 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
62
63 def _real_extract(self, url):
64 file_id = url.split('/')[-1]
65 # Rebuild url in english locale
66 url = 'http://depositfiles.com/en/files/' + file_id
67
68 # Retrieve file webpage with 'Free download' button pressed
69 free_download_indication = { 'gateway_result' : '1' }
70 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
71 try:
72 self.report_download_webpage(file_id)
73 webpage = compat_urllib_request.urlopen(request).read()
74 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
75 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
76
77 # Search for the real file URL
78 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
79 if (mobj is None) or (mobj.group(1) is None):
80 # Try to figure out reason of the error.
81 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
82 if (mobj is not None) and (mobj.group(1) is not None):
83 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
84 raise ExtractorError(u'%s' % restriction_message)
85 else:
86 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
87
88 file_url = mobj.group(1)
89 file_extension = os.path.splitext(file_url)[1][1:]
90
91 # Search for file title
92 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
93
94 return [{
95 'id': file_id.decode('utf-8'),
96 'url': file_url.decode('utf-8'),
97 'uploader': None,
98 'upload_date': None,
99 'title': file_title,
100 'ext': file_extension.decode('utf-8'),
101 }]
102
103
104
105
106
107
108
109
110
111 class EscapistIE(InfoExtractor):
112 """Information extractor for The Escapist """
113
114 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
115 IE_NAME = u'escapist'
116
117 def _real_extract(self, url):
118 mobj = re.match(self._VALID_URL, url)
119 if mobj is None:
120 raise ExtractorError(u'Invalid URL: %s' % url)
121 showName = mobj.group('showname')
122 videoId = mobj.group('episode')
123
124 self.report_extraction(videoId)
125 webpage = self._download_webpage(url, videoId)
126
127 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
128 webpage, u'description', fatal=False)
129
130 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
131 webpage, u'thumbnail', fatal=False)
132
133 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
134 webpage, u'player url')
135
136 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
137 webpage, u'player url').split(' : ')[-1]
138
139 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
140 configUrl = compat_urllib_parse.unquote(configUrl)
141
142 configJSON = self._download_webpage(configUrl, videoId,
143 u'Downloading configuration',
144 u'unable to download configuration')
145
146 # Technically, it's JavaScript, not JSON
147 configJSON = configJSON.replace("'", '"')
148
149 try:
150 config = json.loads(configJSON)
151 except (ValueError,) as err:
152 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
153
154 playlist = config['playlist']
155 videoUrl = playlist[1]['url']
156
157 info = {
158 'id': videoId,
159 'url': videoUrl,
160 'uploader': showName,
161 'upload_date': None,
162 'title': title,
163 'ext': 'mp4',
164 'thumbnail': imgUrl,
165 'description': videoDesc,
166 'player_url': playerUrl,
167 }
168
169 return [info]
170
171 class CollegeHumorIE(InfoExtractor):
172 """Information extractor for collegehumor.com"""
173
174 _WORKING = False
175 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
176 IE_NAME = u'collegehumor'
177
178 def report_manifest(self, video_id):
179 """Report information extraction."""
180 self.to_screen(u'%s: Downloading XML manifest' % video_id)
181
182 def _real_extract(self, url):
183 mobj = re.match(self._VALID_URL, url)
184 if mobj is None:
185 raise ExtractorError(u'Invalid URL: %s' % url)
186 video_id = mobj.group('videoid')
187
188 info = {
189 'id': video_id,
190 'uploader': None,
191 'upload_date': None,
192 }
193
194 self.report_extraction(video_id)
195 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
196 try:
197 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
198 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
199 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
200
201 mdoc = xml.etree.ElementTree.fromstring(metaXml)
202 try:
203 videoNode = mdoc.findall('./video')[0]
204 info['description'] = videoNode.findall('./description')[0].text
205 info['title'] = videoNode.findall('./caption')[0].text
206 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
207 manifest_url = videoNode.findall('./file')[0].text
208 except IndexError:
209 raise ExtractorError(u'Invalid metadata XML file')
210
211 manifest_url += '?hdcore=2.10.3'
212 self.report_manifest(video_id)
213 try:
214 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
215 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
216 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
217
218 adoc = xml.etree.ElementTree.fromstring(manifestXml)
219 try:
220 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
221 node_id = media_node.attrib['url']
222 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
223 except IndexError as err:
224 raise ExtractorError(u'Invalid manifest file')
225
226 url_pr = compat_urllib_parse_urlparse(manifest_url)
227 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
228
229 info['url'] = url
230 info['ext'] = 'f4f'
231 return [info]
232
233
234 class XVideosIE(InfoExtractor):
235 """Information extractor for xvideos.com"""
236
237 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
238 IE_NAME = u'xvideos'
239
240 def _real_extract(self, url):
241 mobj = re.match(self._VALID_URL, url)
242 if mobj is None:
243 raise ExtractorError(u'Invalid URL: %s' % url)
244 video_id = mobj.group(1)
245
246 webpage = self._download_webpage(url, video_id)
247
248 self.report_extraction(video_id)
249
250 # Extract video URL
251 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
252 webpage, u'video URL'))
253
254 # Extract title
255 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
256 webpage, u'title')
257
258 # Extract video thumbnail
259 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
260 webpage, u'thumbnail', fatal=False)
261
262 info = {
263 'id': video_id,
264 'url': video_url,
265 'uploader': None,
266 'upload_date': None,
267 'title': video_title,
268 'ext': 'flv',
269 'thumbnail': video_thumbnail,
270 'description': None,
271 }
272
273 return [info]
274
275
276
277
278 class InfoQIE(InfoExtractor):
279 """Information extractor for infoq.com"""
280 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
281
282 def _real_extract(self, url):
283 mobj = re.match(self._VALID_URL, url)
284 if mobj is None:
285 raise ExtractorError(u'Invalid URL: %s' % url)
286
287 webpage = self._download_webpage(url, video_id=url)
288 self.report_extraction(url)
289
290 # Extract video URL
291 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
292 if mobj is None:
293 raise ExtractorError(u'Unable to extract video url')
294 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
295 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
296
297 # Extract title
298 video_title = self._search_regex(r'contentTitle = "(.*?)";',
299 webpage, u'title')
300
301 # Extract description
302 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
303 webpage, u'description', fatal=False)
304
305 video_filename = video_url.split('/')[-1]
306 video_id, extension = video_filename.split('.')
307
308 info = {
309 'id': video_id,
310 'url': video_url,
311 'uploader': None,
312 'upload_date': None,
313 'title': video_title,
314 'ext': extension, # Extension is always(?) mp4, but seems to be flv
315 'thumbnail': None,
316 'description': video_description,
317 }
318
319 return [info]
320
321 class MixcloudIE(InfoExtractor):
322 """Information extractor for www.mixcloud.com"""
323
324 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
325 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
326 IE_NAME = u'mixcloud'
327
328 def report_download_json(self, file_id):
329 """Report JSON download."""
330 self.to_screen(u'Downloading json')
331
332 def get_urls(self, jsonData, fmt, bitrate='best'):
333 """Get urls from 'audio_formats' section in json"""
334 file_url = None
335 try:
336 bitrate_list = jsonData[fmt]
337 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
338 bitrate = max(bitrate_list) # select highest
339
340 url_list = jsonData[fmt][bitrate]
341 except TypeError: # we have no bitrate info.
342 url_list = jsonData[fmt]
343 return url_list
344
345 def check_urls(self, url_list):
346 """Returns 1st active url from list"""
347 for url in url_list:
348 try:
349 compat_urllib_request.urlopen(url)
350 return url
351 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
352 url = None
353
354 return None
355
356 def _print_formats(self, formats):
357 print('Available formats:')
358 for fmt in formats.keys():
359 for b in formats[fmt]:
360 try:
361 ext = formats[fmt][b][0]
362 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
363 except TypeError: # we have no bitrate info
364 ext = formats[fmt][0]
365 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
366 break
367
368 def _real_extract(self, url):
369 mobj = re.match(self._VALID_URL, url)
370 if mobj is None:
371 raise ExtractorError(u'Invalid URL: %s' % url)
372 # extract uploader & filename from url
373 uploader = mobj.group(1).decode('utf-8')
374 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
375
376 # construct API request
377 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
378 # retrieve .json file with links to files
379 request = compat_urllib_request.Request(file_url)
380 try:
381 self.report_download_json(file_url)
382 jsonData = compat_urllib_request.urlopen(request).read()
383 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
384 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
385
386 # parse JSON
387 json_data = json.loads(jsonData)
388 player_url = json_data['player_swf_url']
389 formats = dict(json_data['audio_formats'])
390
391 req_format = self._downloader.params.get('format', None)
392 bitrate = None
393
394 if self._downloader.params.get('listformats', None):
395 self._print_formats(formats)
396 return
397
398 if req_format is None or req_format == 'best':
399 for format_param in formats.keys():
400 url_list = self.get_urls(formats, format_param)
401 # check urls
402 file_url = self.check_urls(url_list)
403 if file_url is not None:
404 break # got it!
405 else:
406 if req_format not in formats:
407 raise ExtractorError(u'Format is not available')
408
409 url_list = self.get_urls(formats, req_format)
410 file_url = self.check_urls(url_list)
411 format_param = req_format
412
413 return [{
414 'id': file_id.decode('utf-8'),
415 'url': file_url.decode('utf-8'),
416 'uploader': uploader.decode('utf-8'),
417 'upload_date': None,
418 'title': json_data['name'],
419 'ext': file_url.split('.')[-1].decode('utf-8'),
420 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
421 'thumbnail': json_data['thumbnail_url'],
422 'description': json_data['description'],
423 'player_url': player_url.decode('utf-8'),
424 }]
425
426 class StanfordOpenClassroomIE(InfoExtractor):
427 """Information extractor for Stanford's Open ClassRoom"""
428
429 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
430 IE_NAME = u'stanfordoc'
431
432 def _real_extract(self, url):
433 mobj = re.match(self._VALID_URL, url)
434 if mobj is None:
435 raise ExtractorError(u'Invalid URL: %s' % url)
436
437 if mobj.group('course') and mobj.group('video'): # A specific video
438 course = mobj.group('course')
439 video = mobj.group('video')
440 info = {
441 'id': course + '_' + video,
442 'uploader': None,
443 'upload_date': None,
444 }
445
446 self.report_extraction(info['id'])
447 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
448 xmlUrl = baseUrl + video + '.xml'
449 try:
450 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
451 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
452 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
453 mdoc = xml.etree.ElementTree.fromstring(metaXml)
454 try:
455 info['title'] = mdoc.findall('./title')[0].text
456 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
457 except IndexError:
458 raise ExtractorError(u'Invalid metadata XML file')
459 info['ext'] = info['url'].rpartition('.')[2]
460 return [info]
461 elif mobj.group('course'): # A course page
462 course = mobj.group('course')
463 info = {
464 'id': course,
465 'type': 'playlist',
466 'uploader': None,
467 'upload_date': None,
468 }
469
470 coursepage = self._download_webpage(url, info['id'],
471 note='Downloading course info page',
472 errnote='Unable to download course info page')
473
474 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
475
476 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
477 coursepage, u'description', fatal=False)
478
479 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
480 info['list'] = [
481 {
482 'type': 'reference',
483 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
484 }
485 for vpage in links]
486 results = []
487 for entry in info['list']:
488 assert entry['type'] == 'reference'
489 results += self.extract(entry['url'])
490 return results
491 else: # Root page
492 info = {
493 'id': 'Stanford OpenClassroom',
494 'type': 'playlist',
495 'uploader': None,
496 'upload_date': None,
497 }
498
499 self.report_download_webpage(info['id'])
500 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
501 try:
502 rootpage = compat_urllib_request.urlopen(rootURL).read()
503 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
504 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
505
506 info['title'] = info['id']
507
508 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
509 info['list'] = [
510 {
511 'type': 'reference',
512 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
513 }
514 for cpage in links]
515
516 results = []
517 for entry in info['list']:
518 assert entry['type'] == 'reference'
519 results += self.extract(entry['url'])
520 return results
521
522 class MTVIE(InfoExtractor):
523 """Information extractor for MTV.com"""
524
525 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
526 IE_NAME = u'mtv'
527
528 def _real_extract(self, url):
529 mobj = re.match(self._VALID_URL, url)
530 if mobj is None:
531 raise ExtractorError(u'Invalid URL: %s' % url)
532 if not mobj.group('proto'):
533 url = 'http://' + url
534 video_id = mobj.group('videoid')
535
536 webpage = self._download_webpage(url, video_id)
537
538 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
539 webpage, u'song name', fatal=False)
540
541 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
542 webpage, u'title')
543
544 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
545 webpage, u'mtvn_uri', fatal=False)
546
547 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
548 webpage, u'content id', fatal=False)
549
550 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
551 self.report_extraction(video_id)
552 request = compat_urllib_request.Request(videogen_url)
553 try:
554 metadataXml = compat_urllib_request.urlopen(request).read()
555 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
556 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
557
558 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
559 renditions = mdoc.findall('.//rendition')
560
561 # For now, always pick the highest quality.
562 rendition = renditions[-1]
563
564 try:
565 _,_,ext = rendition.attrib['type'].partition('/')
566 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
567 video_url = rendition.find('./src').text
568 except KeyError:
569 raise ExtractorError('Invalid rendition field.')
570
571 info = {
572 'id': video_id,
573 'url': video_url,
574 'uploader': performer,
575 'upload_date': None,
576 'title': video_title,
577 'ext': ext,
578 'format': format,
579 }
580
581 return [info]
582
583
584 class YoukuIE(InfoExtractor):
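    # Overview: the getPlayList response carries a scrambled 'streamfileids' value and a
    # per-video 'seed'. _get_file_ID_mix_string derives a shuffled alphabet from the seed,
    # _get_file_id maps the '*'-separated indices back to characters, and _real_extract
    # then requests one getFlvPath URL per segment key listed under 'segs'.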
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000, 1998)
        random2 = random.randint(1000, 9999)

        return "%d%d%d" % (nowTime, random1, random2)

    def _get_file_ID_mix_string(self, seed):
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031) % 65536
            index = math.floor(seed / 65536 * len(source))
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # The characters at indices 8 and 9 of fileid encode the segment number,
        # so they are replaced with the segment index for every part below.
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info


class XNXXIE(InfoExtractor):
679 """Information extractor for xnxx.com"""
680
681 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
682 IE_NAME = u'xnxx'
683 VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
684 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
685 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
686
687 def _real_extract(self, url):
688 mobj = re.match(self._VALID_URL, url)
689 if mobj is None:
690 raise ExtractorError(u'Invalid URL: %s' % url)
691 video_id = mobj.group(1)
692
693 # Get webpage content
694 webpage = self._download_webpage(url, video_id)
695
696 video_url = self._search_regex(self.VIDEO_URL_RE,
697 webpage, u'video URL')
698 video_url = compat_urllib_parse.unquote(video_url)
699
700 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
701 webpage, u'title')
702
703 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
704 webpage, u'thumbnail', fatal=False)
705
706 return [{
707 'id': video_id,
708 'url': video_url,
709 'uploader': None,
710 'upload_date': None,
711 'title': video_title,
712 'ext': 'flv',
713 'thumbnail': video_thumbnail,
714 'description': None,
715 }]
716
717
718
719 class NBAIE(InfoExtractor):
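    # Overview: no API call is needed here; the download URL is built directly from the
    # video path against Turner's CDN, and only the title and description are scraped
    # from the page's meta tags.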
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # The upload date is not present in the HTML the site returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }
        return [info]


class JustinTVIE(InfoExtractor):
754 """Information extractor for justin.tv and twitch.tv"""
755 # TODO: One broadcast may be split into multiple videos. The key
756 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
757 # starts at 1 and increases. Can we treat all parts as one video?
758
759 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
760 (?:
761 (?P<channelid>[^/]+)|
762 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
763 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
764 )
765 /?(?:\#.*)?$
766 """
767 _JUSTIN_PAGE_LIMIT = 100
768 IE_NAME = u'justin.tv'
769
770 def report_download_page(self, channel, offset):
771 """Report attempt to download a single page of videos."""
772 self.to_screen(u'%s: Downloading video information from %d to %d' %
773 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
774
775 # Return count of items, list of *valid* items
776 def _parse_page(self, url, video_id):
777 webpage = self._download_webpage(url, video_id,
778 u'Downloading video info JSON',
779 u'unable to download video info JSON')
780
781 response = json.loads(webpage)
782 if type(response) != list:
783 error_text = response.get('error', 'unknown error')
784 raise ExtractorError(u'Justin.tv API: %s' % error_text)
785 info = []
786 for clip in response:
787 video_url = clip['video_file_url']
788 if video_url:
789 video_extension = os.path.splitext(video_url)[1][1:]
790 video_date = re.sub('-', '', clip['start_time'][:10])
791 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
792 video_id = clip['id']
793 video_title = clip.get('title', video_id)
794 info.append({
795 'id': video_id,
796 'url': video_url,
797 'title': video_title,
798 'uploader': clip.get('channel_name', video_uploader_id),
799 'uploader_id': video_uploader_id,
800 'upload_date': video_date,
801 'ext': video_extension,
802 })
803 return (len(response), info)
804
805 def _real_extract(self, url):
806 mobj = re.match(self._VALID_URL, url)
807 if mobj is None:
808 raise ExtractorError(u'invalid URL: %s' % url)
809
810 api_base = 'http://api.justin.tv'
811 paged = False
812 if mobj.group('channelid'):
813 paged = True
814 video_id = mobj.group('channelid')
815 api = api_base + '/channel/archives/%s.json' % video_id
816 elif mobj.group('chapterid'):
817 chapter_id = mobj.group('chapterid')
818
819 webpage = self._download_webpage(url, chapter_id)
820 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
821 if not m:
822 raise ExtractorError(u'Cannot find archive of a chapter')
823 archive_id = m.group(1)
824
825 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
826 chapter_info_xml = self._download_webpage(api, chapter_id,
827 note=u'Downloading chapter information',
828 errnote=u'Chapter information download failed')
829 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
830 for a in doc.findall('.//archive'):
831 if archive_id == a.find('./id').text:
832 break
833 else:
834 raise ExtractorError(u'Could not find chapter in chapter information')
835
836 video_url = a.find('./video_file_url').text
837 video_ext = video_url.rpartition('.')[2] or u'flv'
838
839 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
840 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
841 note='Downloading chapter metadata',
842 errnote='Download of chapter metadata failed')
843 chapter_info = json.loads(chapter_info_json)
844
845 bracket_start = int(doc.find('.//bracket_start').text)
846 bracket_end = int(doc.find('.//bracket_end').text)
847
848 # TODO determine start (and probably fix up file)
849 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
850 #video_url += u'?start=' + TODO:start_timestamp
851 # bracket_start is 13290, but we want 51670615
852 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
853 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
854
855 info = {
856 'id': u'c' + chapter_id,
857 'url': video_url,
858 'ext': video_ext,
859 'title': chapter_info['title'],
860 'thumbnail': chapter_info['preview'],
861 'description': chapter_info['description'],
862 'uploader': chapter_info['channel']['display_name'],
863 'uploader_id': chapter_info['channel']['name'],
864 }
865 return [info]
866 else:
867 video_id = mobj.group('videoid')
868 api = api_base + '/broadcast/by_archive/%s.json' % video_id
869
870 self.report_extraction(video_id)
871
872 info = []
873 offset = 0
874 limit = self._JUSTIN_PAGE_LIMIT
875 while True:
876 if paged:
877 self.report_download_page(video_id, offset)
878 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
879 page_count, page_info = self._parse_page(page_url, video_id)
880 info.extend(page_info)
881 if not paged or page_count != limit:
882 break
883 offset += limit
884 return info
885
886 class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }
        return [info]


class SteamIE(InfoExtractor):
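    # Overview: the store page embeds each trailer as a 'movie_<id>' flashvars block.
    # The regexes below walk those blocks together with the matching <span class="title">
    # and movie_thumb tags, and the results are returned as a single playlist per game.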
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
            webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]


class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
        return info


class WorldStarHipHopIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        if 'mp4' in video_url:
            ext = 'mp4'
        else:
            ext = 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Get the thumbnail; if there is none, this is a WSHH candy video,
        # so pull the correct title from the candy markup instead.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

        results = [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
        return results


class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }
        return [info]


class YouPornIE(InfoExtractor):
1078 """Information extractor for youporn.com."""
1079 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
1080
1081 def _print_formats(self, formats):
1082 """Print all available formats"""
1083 print(u'Available formats:')
1084 print(u'ext\t\tformat')
1085 print(u'---------------------------------')
1086 for format in formats:
1087 print(u'%s\t\t%s' % (format['ext'], format['format']))
1088
1089 def _specific(self, req_format, formats):
1090 for x in formats:
1091 if(x["format"]==req_format):
1092 return x
1093 return None
1094
1095 def _real_extract(self, url):
1096 mobj = re.match(self._VALID_URL, url)
1097 if mobj is None:
1098 raise ExtractorError(u'Invalid URL: %s' % url)
1099 video_id = mobj.group('videoid')
1100
1101 req = compat_urllib_request.Request(url)
1102 req.add_header('Cookie', 'age_verified=1')
1103 webpage = self._download_webpage(req, video_id)
1104
1105 # Get JSON parameters
1106 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
1107 try:
1108 params = json.loads(json_params)
1109 except:
1110 raise ExtractorError(u'Invalid JSON')
1111
1112 self.report_extraction(video_id)
1113 try:
1114 video_title = params['title']
1115 upload_date = unified_strdate(params['release_date_f'])
1116 video_description = params['description']
1117 video_uploader = params['submitted_by']
1118 thumbnail = params['thumbnails'][0]['image']
1119 except KeyError:
1120 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
1121
1122 # Get all of the formats available
1123 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
1124 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
1125 webpage, u'download list').strip()
1126
1127 # Get all of the links from the page
1128 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
1129 links = re.findall(LINK_RE, download_list_html)
1130 if(len(links) == 0):
1131 raise ExtractorError(u'ERROR: no known formats available for video')
1132
1133 self.to_screen(u'Links found: %d' % len(links))
1134
1135 formats = []
1136 for link in links:
1137
1138 # A link looks like this:
1139 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
1140 # A path looks like this:
1141 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
1142 video_url = unescapeHTML( link )
1143 path = compat_urllib_parse_urlparse( video_url ).path
1144 extension = os.path.splitext( path )[1][1:]
1145 format = path.split('/')[4].split('_')[:2]
1146 size = format[0]
1147 bitrate = format[1]
1148 format = "-".join( format )
1149 # title = u'%s-%s-%s' % (video_title, size, bitrate)
1150
1151 formats.append({
1152 'id': video_id,
1153 'url': video_url,
1154 'uploader': video_uploader,
1155 'upload_date': upload_date,
1156 'title': video_title,
1157 'ext': extension,
1158 'format': format,
1159 'thumbnail': thumbnail,
1160 'description': video_description
1161 })
1162
1163 if self._downloader.params.get('listformats', None):
1164 self._print_formats(formats)
1165 return
1166
1167 req_format = self._downloader.params.get('format', None)
1168 self.to_screen(u'Format: %s' % req_format)
1169
1170 if req_format is None or req_format == 'best':
1171 return [formats[0]]
1172 elif req_format == 'worst':
1173 return [formats[-1]]
1174 elif req_format in ('-1', 'all'):
1175 return formats
1176 else:
1177 format = self._specific( req_format, formats )
1178 if result is None:
1179 raise ExtractorError(u'Requested format not available')
1180 return [format]
1181
1182
1183
1184 class PornotubeIE(InfoExtractor):
1185 """Information extractor for pornotube.com."""
1186 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
1187
1188 def _real_extract(self, url):
1189 mobj = re.match(self._VALID_URL, url)
1190 if mobj is None:
1191 raise ExtractorError(u'Invalid URL: %s' % url)
1192
1193 video_id = mobj.group('videoid')
1194 video_title = mobj.group('title')
1195
1196 # Get webpage content
1197 webpage = self._download_webpage(url, video_id)
1198
1199 # Get the video URL
1200 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
1201 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
1202 video_url = compat_urllib_parse.unquote(video_url)
1203
1204 #Get the uploaded date
1205 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
1206 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
1207 if upload_date: upload_date = unified_strdate(upload_date)
1208
1209 info = {'id': video_id,
1210 'url': video_url,
1211 'uploader': None,
1212 'upload_date': upload_date,
1213 'title': video_title,
1214 'ext': 'flv',
1215 'format': 'flv'}
1216
1217 return [info]
1218
1219 class YouJizzIE(InfoExtractor):
1220 """Information extractor for youjizz.com."""
1221 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
1222
1223 def _real_extract(self, url):
1224 mobj = re.match(self._VALID_URL, url)
1225 if mobj is None:
1226 raise ExtractorError(u'Invalid URL: %s' % url)
1227
1228 video_id = mobj.group('videoid')
1229
1230 # Get webpage content
1231 webpage = self._download_webpage(url, video_id)
1232
1233 # Get the video title
1234 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
1235 webpage, u'title').strip()
1236
1237 # Get the embed page
1238 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
1239 if result is None:
1240 raise ExtractorError(u'ERROR: unable to extract embed page')
1241
1242 embed_page_url = result.group(0).strip()
1243 video_id = result.group('videoid')
1244
1245 webpage = self._download_webpage(embed_page_url, video_id)
1246
1247 # Get the video URL
1248 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
1249 webpage, u'video URL')
1250
1251 info = {'id': video_id,
1252 'url': video_url,
1253 'title': video_title,
1254 'ext': 'flv',
1255 'format': 'flv',
1256 'player_url': embed_page_url}
1257
1258 return [info]
1259
1260 class EightTracksIE(InfoExtractor):
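    # Overview: the mix metadata is read from the PAGE.mix object embedded in the page,
    # then the 8tracks play/next JSON API is polled once per track (with a random session
    # id) until 'at_last_track' is set, yielding one m4a entry per song.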
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res


class KeekIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }
        return [info]


class TEDIE(InfoExtractor):
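    # Overview: both single talks and playlists are handled. Playlist pages are scraped
    # for their talk URLs and re-dispatched through url_result/playlist_result, while
    # _talk_info reads the embedded talkDetails JSON and picks the last htmlStreams entry.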
    _VALID_URL = r'''http://www\.ted\.com/
        (
            ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>\w+) # Here goes the name and then ".html"
        '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else:
            playlist_id = m.group('playlist_id')
            name = m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id, name))
            return [self._playlist_videos_info(url, name, playlist_id)]

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE = r'''
            <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
            ([.\s]*?)data-playlist_item_id="(\d+)"
            ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
            '''
        video_name_RE = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
            webpage, 'playlist title')

        playlist_entries = []
        for m_video, m_name in zip(m_videos, m_names):
            video_id = m_video.group('video_id')
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
            webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
            webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
            webpage, 'description', flags=re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
            webpage, 'thumbnail')
        info = {
            'id': info['id'],
            'url': info['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': desc,
        }
        return info


class MySpassIE(InfoExtractor):
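    # Overview: the video id is taken from the last (or second to last) path element of
    # the URL, and all metadata comes from getvideometadataxml.php, which returns an XML
    # document with url_flv, title, format_id, description and imagePreview elements.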
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]


class SpiegelIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]


class LiveLeakIE(InfoExtractor):
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]


class TumblrIE(InfoExtractor):
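    # Overview: the post page embeds the player with \x22-escaped attributes, so the
    # regexes below match the escaped markup to pull out the video_file URL, the
    # container extension and the first poster thumbnail.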
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]


class BandcampIE(InfoExtractor):
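    # Overview: only freely downloadable tracks are supported. The track id is read from
    # the TralbumData block, the free download page yields the mp3-320 entry, and a
    # statdownload request (with a fixed .rand value) returns the final retry_url.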
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
            webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
            'Downloading free downloads page')
        # We get the dictionary of the track from some JavaScript code
        info = re.search(r'items: (.*?),$',
            download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built by Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]


class RedTubeIE(InfoExtractor):
1610 """Information Extractor for redtube"""
1611 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
1612
1613 def _real_extract(self,url):
1614 mobj = re.match(self._VALID_URL, url)
1615 if mobj is None:
1616 raise ExtractorError(u'Invalid URL: %s' % url)
1617
1618 video_id = mobj.group('id')
1619 video_extension = 'mp4'
1620 webpage = self._download_webpage(url, video_id)
1621
1622 self.report_extraction(video_id)
1623
1624 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
1625 webpage, u'video URL')
1626
1627 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
1628 webpage, u'title')
1629
1630 return [{
1631 'id': video_id,
1632 'url': video_url,
1633 'ext': video_extension,
1634 'title': video_title,
1635 }]
1636
1637 class InaIE(InfoExtractor):
1638 """Information Extractor for Ina.fr"""
1639 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
1640
1641 def _real_extract(self, url):
1642 mobj = re.match(self._VALID_URL, url)
1643
1644 video_id = mobj.group('id')
1645 mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
1646 video_extension = 'mp4'
1647 webpage = self._download_webpage(mrss_url, video_id)
1648
1649 self.report_extraction(video_id)
1650
1651 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
1652 webpage, u'video URL')
1653
1654 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
1655 webpage, u'title')
1656
1657 return [{
1658 'id': video_id,
1659 'url': video_url,
1660 'ext': video_extension,
1661 'title': video_title,
1662 }]
1663
1664 class HowcastIE(InfoExtractor):
1665 """Information Extractor for Howcast.com"""
1666 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
1667
1668 def _real_extract(self, url):
1669 mobj = re.match(self._VALID_URL, url)
1670
1671 video_id = mobj.group('id')
1672 webpage_url = 'http://www.howcast.com/videos/' + video_id
1673 webpage = self._download_webpage(webpage_url, video_id)
1674
1675 self.report_extraction(video_id)
1676
1677 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
1678 webpage, u'video URL')
1679
1680 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
1681 webpage, u'title')
1682
1683 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
1684 webpage, u'description', fatal=False)
1685
1686 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
1687 webpage, u'thumbnail', fatal=False)
1688
1689 return [{
1690 'id': video_id,
1691 'url': video_url,
1692 'ext': 'mp4',
1693 'title': video_title,
1694 'description': video_description,
1695 'thumbnail': thumbnail,
1696 }]
1697
1698 class VineIE(InfoExtractor):
1699 """Information Extractor for Vine.co"""
1700 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
1701
1702 def _real_extract(self, url):
1703 mobj = re.match(self._VALID_URL, url)
1704
1705 video_id = mobj.group('id')
1706 webpage_url = 'https://vine.co/v/' + video_id
1707 webpage = self._download_webpage(webpage_url, video_id)
1708
1709 self.report_extraction(video_id)
1710
1711 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
1712 webpage, u'video URL')
1713
1714 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1715 webpage, u'title')
1716
1717 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
1718 webpage, u'thumbnail', fatal=False)
1719
1720 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
1721 webpage, u'uploader', fatal=False, flags=re.DOTALL)
1722
1723 return [{
1724 'id': video_id,
1725 'url': video_url,
1726 'ext': 'mp4',
1727 'title': video_title,
1728 'thumbnail': thumbnail,
1729 'uploader': uploader,
1730 }]
1731
1732 class FlickrIE(InfoExtractor):
1733 """Information Extractor for Flickr videos"""
1734 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
1735
1736 def _real_extract(self, url):
1737 mobj = re.match(self._VALID_URL, url)
1738
1739 video_id = mobj.group('id')
1740 video_uploader_id = mobj.group('uploader_id')
1741 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
1742 webpage = self._download_webpage(webpage_url, video_id)
1743
1744 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
1745
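# The stream location is resolved in two steps: video_mtl_xml.gne returns a node id,
# which video_playlist.gne then resolves to a playlist whose STREAM entry (APP +
# FULLPATH) is concatenated into the final video URL below.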
1746 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
1747 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
1748
1749 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
1750 first_xml, u'node_id')
1751
1752 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
1753 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
1754
1755 self.report_extraction(video_id)
1756
1757 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
1758 if mobj is None:
1759 raise ExtractorError(u'Unable to extract video url')
1760 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
1761
1762 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
1763 webpage, u'video title')
1764
1765 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
1766 webpage, u'description', fatal=False)
1767
1768 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
1769 webpage, u'thumbnail', fatal=False)
1770
1771 return [{
1772 'id': video_id,
1773 'url': video_url,
1774 'ext': 'mp4',
1775 'title': video_title,
1776 'description': video_description,
1777 'thumbnail': thumbnail,
1778 'uploader_id': video_uploader_id,
1779 }]
1780
1781 class TeamcocoIE(InfoExtractor):
1782 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
1783
1784 def _real_extract(self, url):
1785 mobj = re.match(self._VALID_URL, url)
1786 if mobj is None:
1787 raise ExtractorError(u'Invalid URL: %s' % url)
1788 url_title = mobj.group('url_title')
1789 webpage = self._download_webpage(url, url_title)
1790
1791 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
1792 webpage, u'video id')
1793
1794 self.report_extraction(video_id)
1795
1796 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1797 webpage, u'title')
1798
1799 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
1800 webpage, u'thumbnail', fatal=False)
1801
1802 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
1803 webpage, u'description', fatal=False)
1804
1805 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
1806 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
1807
1808 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
1809 data, u'video URL')
1810
1811 return [{
1812 'id': video_id,
1813 'url': video_url,
1814 'ext': 'mp4',
1815 'title': video_title,
1816 'thumbnail': thumbnail,
1817 'description': video_description,
1818 }]
1819
1820 class XHamsterIE(InfoExtractor):
1821 """Information Extractor for xHamster"""
1822 _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
1823
1824 def _real_extract(self, url):
1825 mobj = re.match(self._VALID_URL, url)
1826
1827 video_id = mobj.group('id')
1828 webpage_url = 'http://xhamster.com/movies/%s/.html' % video_id
1829 webpage = self._download_webpage(webpage_url, video_id)
1830
1831 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
1832 if mobj is None:
1833 raise ExtractorError(u'Unable to extract media URL')
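# When 'srv' is empty, 'file' already holds the full (percent-encoded) media URL;
# otherwise the final URL is the server followed by '/key=' and the file token.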
1834 if len(mobj.group('server')) == 0:
1835 video_url = compat_urllib_parse.unquote(mobj.group('file'))
1836 else:
1837 video_url = mobj.group('server')+'/key='+mobj.group('file')
1838 video_extension = video_url.split('.')[-1]
1839
1840 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
1841 webpage, u'title')
1842
1843 # Can't see the description anywhere in the UI
1844 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
1845 # webpage, u'description', fatal=False)
1846 # if video_description: video_description = unescapeHTML(video_description)
1847
1848 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
1849 if mobj:
1850 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
1851 else:
1852 video_upload_date = None
1853 self._downloader.report_warning(u'Unable to extract upload date')
1854
1855 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
1856 webpage, u'uploader id', default=u'anonymous')
1857
1858 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
1859 webpage, u'thumbnail', fatal=False)
1860
1861 return [{
1862 'id': video_id,
1863 'url': video_url,
1864 'ext': video_extension,
1865 'title': video_title,
1866 # 'description': video_description,
1867 'upload_date': video_upload_date,
1868 'uploader_id': video_uploader_id,
1869 'thumbnail': video_thumbnail
1870 }]
1871
1872 class HypemIE(InfoExtractor):
1873 """Information Extractor for hypem"""
1874 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
1875
1876 def _real_extract(self, url):
1877 mobj = re.match(self._VALID_URL, url)
1878 if mobj is None:
1879 raise ExtractorError(u'Invalid URL: %s' % url)
1880 track_id = mobj.group(1)
1881
1882 data = { 'ax': 1, 'ts': time.time() }
1883 data_encoded = compat_urllib_parse.urlencode(data)
1884 complete_url = url + "?" + data_encoded
1885 request = compat_urllib_request.Request(complete_url)
1886 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
1887 cookie = urlh.headers.get('Set-Cookie', '')
1888
1889 self.report_extraction(track_id)
1890
1891 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
1892 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
1893 try:
1894 track_list = json.loads(html_tracks)
1895 track = track_list[u'tracks'][0]
1896 except ValueError:
1897 raise ExtractorError(u'Hypemachine contained invalid JSON.')
1898
1899 key = track[u"key"]
1900 track_id = track[u"id"]
1901 artist = track[u"artist"]
1902 title = track[u"song"]
1903
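# The serve/source endpoint is requested with the cookie captured from the initial
# page, so the call is tied to the same session; it returns JSON whose 'url' field
# holds the final stream URL.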
1904 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
1905 request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
1906 request.add_header('cookie', cookie)
1907 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
1908 try:
1909 song_data = json.loads(song_data_json)
1910 except ValueError:
1911 raise ExtractorError(u'Hypemachine contained invalid JSON.')
1912 final_url = song_data[u"url"]
1913
1914 return [{
1915 'id': track_id,
1916 'url': final_url,
1917 'ext': "mp3",
1918 'title': title,
1919 'artist': artist,
1920 }]
1921
1922 class Vbox7IE(InfoExtractor):
1923 """Information Extractor for Vbox7"""
1924 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
1925
1926 def _real_extract(self, url):
1927 mobj = re.match(self._VALID_URL, url)
1928 if mobj is None:
1929 raise ExtractorError(u'Invalid URL: %s' % url)
1930 video_id = mobj.group(1)
1931
1932 redirect_page, urlh = self._download_webpage_handle(url, video_id)
1933 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
1934 redirect_url = urlh.geturl() + new_location
1935 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
1936
1937 title = self._html_search_regex(r'<title>(.*)</title>',
1938 webpage, u'title').split('/')[0].strip()
1939
1940 ext = "flv"
1941 info_url = "http://vbox7.com/play/magare.do"
1942 data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
1943 info_request = compat_urllib_request.Request(info_url, data)
1944 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1945 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
1946 if info_response is None:
1947 raise ExtractorError(u'Unable to extract the media url')
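# info_response is expected to consist of exactly two '&'-separated key=value pairs
# (the media URL and the thumbnail URL); the tuple unpacking below relies on that shape.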
1948 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
1949
1950 return [{
1951 'id': video_id,
1952 'url': final_url,
1953 'ext': ext,
1954 'title': title,
1955 'thumbnail': thumbnail_url,
1956 }]
1957
1958
1959 def gen_extractors():
1960 """ Return a list of an instance of every supported extractor.
1961 The order does matter; the first extractor matched is the one handling the URL.
1962 """
1963 return [
1964 YoutubePlaylistIE(),
1965 YoutubeChannelIE(),
1966 YoutubeUserIE(),
1967 YoutubeSearchIE(),
1968 YoutubeIE(),
1969 MetacafeIE(),
1970 DailymotionIE(),
1971 GoogleSearchIE(),
1972 PhotobucketIE(),
1973 YahooIE(),
1974 YahooSearchIE(),
1975 DepositFilesIE(),
1976 FacebookIE(),
1977 BlipTVIE(),
1978 BlipTVUserIE(),
1979 VimeoIE(),
1980 MyVideoIE(),
1981 ComedyCentralIE(),
1982 EscapistIE(),
1983 CollegeHumorIE(),
1984 XVideosIE(),
1985 SoundcloudSetIE(),
1986 SoundcloudIE(),
1987 InfoQIE(),
1988 MixcloudIE(),
1989 StanfordOpenClassroomIE(),
1990 MTVIE(),
1991 YoukuIE(),
1992 XNXXIE(),
1993 YouJizzIE(),
1994 PornotubeIE(),
1995 YouPornIE(),
1996 GooglePlusIE(),
1997 ArteTvIE(),
1998 NBAIE(),
1999 WorldStarHipHopIE(),
2000 JustinTVIE(),
2001 FunnyOrDieIE(),
2002 SteamIE(),
2003 UstreamIE(),
2004 RBMARadioIE(),
2005 EightTracksIE(),
2006 KeekIE(),
2007 TEDIE(),
2008 MySpassIE(),
2009 SpiegelIE(),
2010 LiveLeakIE(),
2011 ARDIE(),
2012 ZDFIE(),
2013 TumblrIE(),
2014 BandcampIE(),
2015 RedTubeIE(),
2016 InaIE(),
2017 HowcastIE(),
2018 VineIE(),
2019 FlickrIE(),
2020 TeamcocoIE(),
2021 XHamsterIE(),
2022 HypemIE(),
2023 Vbox7IE(),
2024 GametrailersIE(),
2025 StatigramIE(),
2026 GenericIE()
2027 ]
2028
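# Illustrative sketch (not part of the original module): because the list returned by
# gen_extractors() is ordered, a caller can resolve a URL by taking the first instance
# whose suitable() classmethod (inherited from extractor.common.InfoExtractor) accepts
# it, roughly like this:
#
#   def _first_suitable_extractor(url):
#       for ie in gen_extractors():
#           if ie.suitable(url):
#               return ie
#       return None
#
# GenericIE is listed last so that it only acts as the fallback when no specific
# extractor matches.
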
2029 def get_info_extractor(ie_name):
2030 """Returns the info extractor class with the given ie_name"""
2031 return globals()[ie_name + 'IE']
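# Illustrative example (not part of the original module): get_info_extractor('Bandcamp')
# looks up 'BandcampIE' in this module's globals and returns the class itself, not an
# instance, so callers are expected to instantiate it themselves.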