]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
Move NBA IE into its own file
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.infoq import InfoQIE
35 from .extractor.metacafe import MetacafeIE
36 from .extractor.myvideo import MyVideoIE
37 from .extractor.nba import NBAIE
38 from .extractor.statigram import StatigramIE
39 from .extractor.photobucket import PhotobucketIE
40 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
41 from .extractor.stanfordoc import StanfordOpenClassroomIE
42 from .extractor.vimeo import VimeoIE
43 from .extractor.xvideos import XVideosIE
44 from .extractor.yahoo import YahooIE, YahooSearchIE
45 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
46 from .extractor.zdf import ZDFIE
47
48
49
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    # Marked broken: the site switched to a new API (see URL in the comment).
    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        file_url = None
        try:
            # If the format maps bitrates to URL lists, pick the requested
            # bitrate, falling back to the highest one available.
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probes each candidate with a HEAD-less GET; first reachable URL wins.
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """List every available format (and bitrate, when known) on stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # NOTE(review): the .decode('utf-8') calls in this method are
        # Python-2 str->unicode conversions; under Python 3 they would
        # raise AttributeError on str objects.
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # No explicit format requested: take the first format whose URL
            # actually responds.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
154
155
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract a single MTV.com video.

        Returns a one-element list with the info dict; raises
        ExtractorError on an unrecognized URL or failed metadata download.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')
        # BUGFIX: the info dict below referenced an undefined name
        # `performer`, raising NameError on every successful extraction.
        # The <meta name="mtv_an"> tag carries the artist name, so reuse
        # its value as the performer/uploader.
        performer = video_title

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        # mediaGen endpoint returns an XML document listing the available
        # renditions of this video.  NOTE: mtvn_uri/content_id may be None
        # (fatal=False above) — TODO confirm whether that can occur on
        # pages that otherwise match _VALID_URL.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        # Guard against an empty rendition list; renditions[-1] would
        # otherwise raise a bare IndexError.
        if not renditions:
            raise ExtractorError(u'Unable to find any renditions')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
216
217
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        # Session id: millisecond timestamp followed by two random numbers.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the alphabet using a linear-congruential
        # PRNG seeded by the server-provided `seed`; Youku uses the result
        # as a substitution table for descrambling file ids.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # Translate the '*'-separated index string into the real file id
        # by looking each index up in the shuffled alphabet.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested format onto Youku's stream names:
            # hd2 when available for 'best', otherwise flv;
            # 'worst' presumably maps to mp4 — TODO confirm mp4 really is
            # the lowest-quality stream.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        # NOTE(review): the code below actually substitutes characters at
        # indices 8-9 (keeps fileid[0:8] and fileid[10:]), which disagrees
        # with the comment's 7:9 — behavior kept as-is.
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
310
311
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Extract the FLV URL, title and thumbnail from a video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        # The player URL is percent-encoded inside the flashvars.
        flv_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL'))

        title = self._html_search_regex(self.VIDEO_TITLE_RE, webpage, u'title')
        thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': flv_url,
            'uploader': None,
            'upload_date': None,
            'title': title,
            'ext': 'flv',
            'thumbnail': thumbnail,
            'description': None,
        }]
350
351
352
353
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes: a bare channel, a /b/ broadcast id, or a /c/ chapter id.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Parse one JSON API page; skips clips without a video_file_url."""
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # The API signals errors with an object instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO-like; strip the dashes to get YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Channel archives are paginated; iterate below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # Chapter URLs: find the archive the chapter belongs to, then
            # fetch chapter metadata from the twitch kraken API.
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: raise only if no <archive> matched our archive_id.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #   youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
486
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'invalid URL: %s' % url)
        video_id = match.group('id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # The title lives in the player header; fall back to <title>.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
515
class SteamIE(InfoExtractor):
    """Information extractor for Steam store trailer pages."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Extract every trailer on a game's store page as a playlist."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        page_url = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(page_url, gameID)

        # Mature titles sit behind an age gate; retry via the agecheck URL
        # with a canned birth date.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            page_url = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(page_url, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
            webpage, 'game title')

        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        name_re = r'<span class="title">(?P<videoName>.+?)</span>'
        thumb_re = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        movie_matches = re.finditer(movie_re, webpage)
        title_matches = re.finditer(name_re, webpage)
        thumb_matches = re.finditer(thumb_re, webpage)

        videos = []
        # The three iterators advance in lockstep: each pattern matches its
        # entries in the same document order.
        for movie, title_m, thumb_m in zip(movie_matches, title_matches, thumb_matches):
            trailer_id = movie.group('videoID')
            trailer_title = title_m.group('videoName')
            trailer_url = movie.group('videoURL')
            trailer_thumb = thumb_m.group('thumbnail')
            if not trailer_url:
                raise ExtractorError(u'Cannot find video url for %s' % trailer_id)
            videos.append({
                'id': trailer_id,
                'url': trailer_url,
                'ext': 'flv',
                'title': unescapeHTML(trailer_title),
                'thumbnail': trailer_thumb,
            })
        return [self.playlist_result(videos, gameID, game_title)]
570
class UstreamIE(InfoExtractor):
    """Information extractor for recorded Ustream videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The FLV file is addressable directly from the video id.
        download_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')
        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)
        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        return {
            'id': video_id,
            'url': download_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
602
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # Container is inferred from the stream URL itself.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                video_title = candy_match.group(1)

        return [{
            'id': video_id,
            'url' : video_url,
            'title' : video_title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
642
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com show pages."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded as a JS assignment onto window.gon.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
676
677
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals req_format, or None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        """Extract one or more downloadable formats from a video page.

        Returns a list of info dicts (one per selected format), or None
        when only listing formats; raises ExtractorError otherwise.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site requires an age-verification cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; json.loads fails with ValueError.
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as err:
            # BUGFIX: the old code concatenated a str with the exception
            # object (TypeError); format it instead.
            raise ExtractorError(u'Missing JSON parameter: %s' % err)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # Path segment 4 looks like "480p_370k_<id>": size then bitrate.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join(format)
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # BUGFIX: this branch previously tested an undefined name
            # `result`, raising NameError instead of reporting the
            # unavailable format.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
782
783
784
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('videoid')
        video_title = match.group('title')

        webpage = self._download_webpage(url, video_id)

        # Direct FLV link embedded in the player setup code.
        video_url = self._search_regex(r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        # Upload date, when present, is rendered as "Added MM/DD/YYYY by".
        upload_date = self._html_search_regex(r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
819
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group('videoid')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # The actual stream is only referenced from a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
860
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""

    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded in the page as a JS assignment.
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        # The API hands out one track at a time; keep asking for the next
        # track until it reports we are on the last one.
        entries = []
        next_url = first_url
        track_index = 0
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_index+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
            track_index += 1
        return entries
901
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail URLs are derived directly from the video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
929
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Verbose regex: matches either a playlist URL or a single-talk URL,
    # optionally containing a /lang/<code> component before the final name.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL type: a single talk vs. a whole playlist.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Each talk entry on the playlist page carries data-* attributes;
        # the companion regex below extracts the talk's relative URL + title.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Delegate each talk to the single-talk extraction via url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        # talkDetails is an inline JSON object embedded in a <script> tag;
        # it contains the stream URLs used below.
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                       webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # NOTE: 'info' is rebound here from the parsed JSON to the result dict;
        # the last entry of htmlStreams is used as the media URL.
        info = {
            'id': info['id'],
            'url': info['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': desc,
        }
        return info
1004
class MySpassIE(InfoExtractor):
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Extract video information from the myspass.de metadata XML.

        The video id is the last path element of the URL (or the second to
        last when the URL has a trailing slash); metadata is fetched from a
        dedicated XML endpoint keyed by that id.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # mandatory fields: download URL and title
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # optional fields
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch previously read the undefined name 'ext'
            # (NameError); fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
1058
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        # The title comes from the HTML page; stream data from a per-video XML.
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        video_title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', webpage, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(
            xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last listed variant, as the original implementation does.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
1090
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",', webpage, u'video URL')

        # og:title is prefixed with the site name; strip it off.
        raw_title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        video_title = raw_title.replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)
        video_uploader = self._html_search_regex(
            r'By:.*?(\w+)</a>', webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader,
        }]
1127
1128
1129
class TumblrIE(InfoExtractor):
    """Information extractor for videos embedded in Tumblr posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        blog = mobj.group('blog_name')

        # Normalize to the canonical post URL before fetching.
        post_url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(post_url, video_id)

        # The player markup is embedded as an escaped string (\x22 == '"').
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster frame as the thumbnail, when present.
        video_thumbnail = self._search_regex(
            r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(
            r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': ext,
        }]
1163
class BandcampIE(InfoExtractor):
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Extract a freely downloadable Bandcamp track.

        Only works for tracks that expose a free download page; raises
        ExtractorError otherwise.
        """
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # renamed from 'id' to avoid shadowing the builtin
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built by Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
1209
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The direct mp4 source is embedded in a <source> tag.
        video_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL')
        video_title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }
        return [info]
1237
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # Metadata (including the mp4 URL) is served as an MRSS feed.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
1264
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # Re-canonicalize the URL so scheme/host variants fetch the same page.
        webpage = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        # The meta tags may use either quote style around the content value.
        video_title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
1298
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # Always fetch over https from the canonical host.
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')
        # Drop any query string from the og:image URL.
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
1332
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')

        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The photo "secret" is required by both video XML endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = ('https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id='
                     + video_id + '&secret=' + secret + '&bitrate=700&target=_self')
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        second_url = ('https://secure.flickr.com/video_playlist.gne?node_id=' + node_id
                      + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret
                      + '&rd=video.yahoo.com&noad=1')
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The playlist XML gives the stream as an app prefix plus a full path.
        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
1381
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id lives on the page's <article> element.
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # A separate XML document lists the media files for this id.
        data = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
1420
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        # Extract the id, fetch the canonical movie page, then pull the media
        # URL out of the inline flash player configuration.
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # 'srv' + 'file' come from the player setup script; an empty server
        # means 'file' already holds a complete percent-encoded URL.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # The upload date is only available in a tooltip hint attribute;
        # reassemble it as YYYYMMDD, or warn (non-fatal) when absent.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        # Anonymous uploads carry no user link; default accordingly.
        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
1472
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        # Two-step extraction: load the track page (capturing the Set-Cookie
        # header), then ask the serve endpoint for the final mp3 URL while
        # forwarding that cookie (presumably a session requirement -- this
        # mirrors the site's own requests).
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # 'ax'/'ts' query parameters mimic the site's own AJAX requests.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # Keep the cookie for the serve request below.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # Track metadata is embedded as JSON inside a <script> tag.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        # NOTE: track_id is rebound here to the site's internal track id.
        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # Ask the serve endpoint for the final media URL.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
1522
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page bounces via window.location; follow the redirect.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(
            r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(
            r'<title>(.*)</title>', webpage, u'title').split('/')[0].strip()

        # POST to the info endpoint; the response is a querystring carrying
        # the media and thumbnail URLs.
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request("http://vbox7.com/play/magare.do", data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id': video_id,
            'url': final_url,
            'ext': "flv",
            'title': title,
            'thumbnail': thumbnail_url,
        }]
1558
1559
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Ordering is significant: more specific extractors come before more
    # general ones, with GenericIE as the final catch-all.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [klass() for klass in ie_classes]
1629
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention.
    return globals()['%sIE' % ie_name]