]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
Move FunnyOrDie into its own file
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.funnyordie import FunnyOrDieIE
31 from .extractor.gametrailers import GametrailersIE
32 from .extractor.generic import GenericIE
33 from .extractor.googleplus import GooglePlusIE
34 from .extractor.googlesearch import GoogleSearchIE
35 from .extractor.infoq import InfoQIE
36 from .extractor.metacafe import MetacafeIE
37 from .extractor.mixcloud import MixcloudIE
38 from .extractor.mtv import MTVIE
39 from .extractor.myvideo import MyVideoIE
40 from .extractor.nba import NBAIE
41 from .extractor.statigram import StatigramIE
42 from .extractor.photobucket import PhotobucketIE
43 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
44 from .extractor.stanfordoc import StanfordOpenClassroomIE
45 from .extractor.steam import SteamIE
46 from .extractor.ted import TEDIE
47 from .extractor.vimeo import VimeoIE
48 from .extractor.worldstarhiphop import WorldStarHipHopIE
49 from .extractor.xnxx import XNXXIE
50 from .extractor.xvideos import XVideosIE
51 from .extractor.yahoo import YahooIE, YahooSearchIE
52 from .extractor.youku import YoukuIE
53 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
54 from .extractor.zdf import ZDFIE
55
56
57
58
59
60
61
62
63
64
65
66
67
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes are recognized: a bare channel page, a /b/ broadcast
    # (archive) page, and a /c/ chapter page.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Page size used when walking a channel's archive listing.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Fetch one archive API page and build an info dict per clip.

        Returns ``(total_items_on_page, [info dicts])``.  The raw count is
        what the caller uses to decide whether another page exists, while
        the info list only contains clips that expose a video_file_url.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        # On error the API answers with a dict carrying an 'error' key
        # instead of the usual list of clips.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Clips without a file URL are silently skipped.
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with YYYY-MM-DD; dropping the dashes
                # yields the YYYYMMDD upload_date convention.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through the archive listing below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # Chapter page: resolve the chapter to its parent archive and
            # return early with a single-entry result.
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: the else branch fires only when no archive matched;
            # after the break, `a` is the matching <archive> element.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Title/description/uploader come from the newer twitch.tv
            # "kraken" API rather than the justin.tv one.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                       note='Downloading chapter metadata',
                                       errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single broadcast (/b/ URL): one API page is enough.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means the archive listing is exhausted.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
200
201
202
class UstreamIE(InfoExtractor):
    """Extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('videoID')

        # The CDN URL can be derived from the numeric id alone; the page is
        # only needed for the descriptive metadata.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'data-title="(?P<title>.+)"', webpage, u'title')
        uploader = self._html_search_regex(
            r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)
        thumbnail = self._html_search_regex(
            r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
234
235
class RBMARadioIE(InfoExtractor):
    """Extractor for rbmaradio.com show pages."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata lives in a JSON blob assigned to window.gon.show.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
                                       webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Ask the CDN for the 256 kbps stream.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
269
270
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry of formats whose 'format' field equals
        req_format, or None when no such entry exists."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # An age-gate cookie is required to see the real page content.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # was a bare `except:` — only a JSON decode error is expected here
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError:
            # Format the exception instead of concatenating it to a str:
            # `'...' + KeyError(...)` raises TypeError and masks the cause.
            raise ExtractorError(u'Missing JSON parameter: %s' % sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if len(links) == 0:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The 4th path component carries "<size>_<bitrate>_<id>"; the
            # first two pieces joined by '-' become the format label.
            format = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # BUG FIX: previously tested the undefined name `result`,
            # which raised NameError instead of the intended error.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
375
376
377
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flv URL sits, percent-encoded, in the player setup JS.
        video_url = compat_urllib_parse.unquote(self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url'))

        # Upload date is optional; normalize it to YYYYMMDD when present.
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
412
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The public page carries the title and the link to the embed
        # player; the media URL only appears on the embed page.
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<title>(?P<title>.*)</title>', webpage, u'title').strip()

        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
453
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes.

    Returns one info dict per track in the mix.
    """
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded in the page as a JS assignment.
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API requires a session token (any random number works
        # here); tracks are then fetched one at a time: the first via
        # /play, each subsequent one via /next with the previous track id.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            # The API flags the final track; stop instead of requesting
            # past the end of the mix.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
494
class KeekIE(InfoExtractor):
    """Extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('videoID')

        # Media and thumbnail URLs follow a fixed CDN scheme derived from
        # the id; the page is only needed for title/uploader.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
522
523
class MySpassIE(InfoExtractor):
    """Extractor for myspass.de (metadata comes from the site's XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch referenced the undefined name `ext`
            # (NameError); the file extension computed above was intended.
            format = extension
        else:
            format = format_id_el.text
        # Optional fields: fall back to None when the element is absent.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
577
class SpiegelIE(InfoExtractor):
    """Extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('videoID')

        webpage = self._download_webpage(url, video_id)
        title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', webpage, u'title')

        # Each video has a companion XML file listing the available
        # encodings; the last entry in it is the one used here.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': title,
            'duration': duration,
        }]
609
class LiveLeakIE(InfoExtractor):
    """Extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(
            r'file: "(.*?)",', webpage, u'video URL')

        # og:title carries a site prefix which is stripped off here.
        title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()
        description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)
        uploader = self._html_search_regex(
            r'By:.*?(\w+)</a>', webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'uploader': uploader
        }]
646
647
648
class TumblrIE(InfoExtractor):
    """Extractor for videos posted on tumblr.com blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is JS-escaped inside the page (\x22 quotes).
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(
            r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(
            r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': ext,
        }]
682
class BandcampIE(InfoExtractor):
    """Extractor for free bandcamp.com tracks.

    Only tracks exposing a free download page can be extracted; the
    mp3-320 encoding is always selected.
    """
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # The numeric track id lives in the TralbumData JS object.
        # (Local renamed from `id` to avoid shadowing the builtin.)
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
728
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The mp4 source and the title both sit in plain page markup.
        video_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL')
        video_title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
756
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        # Metadata is read from the player's MRSS feed, not the HTML page.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
783
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        # Rebuild the canonical URL from the numeric id.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
817
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        # Rebuild the canonical https URL from the id.
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
851
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        # The stream URL is resolved in three steps:
        #   1. the photo page yields a per-video "secret",
        #   2. an XML endpoint maps (id, secret) to a node_id,
        #   3. a playlist endpoint maps node_id to an APP/FULLPATH pair
        #      whose concatenation is the final stream URL.
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        # FULLPATH is HTML-escaped in the XML; APP is not.
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        # Descriptive metadata comes from the og: tags of the photo page.
        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
900
class TeamcocoIE(InfoExtractor):
    """Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id is embedded in the article markup of the page.
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # The media URL comes from a separate per-video XML document.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
939
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # NOTE: the dot after 'www' is escaped; a bare '.' here would match any
    # single character (e.g. 'wwwx'), not just a literal dot.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Guard against a non-matching URL: without this check a bad URL
        # crashed with AttributeError on mobj.group(), unlike the other IEs
        # in this file which raise ExtractorError.
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Re-fetch through the canonical movie page URL for this id.
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config embeds the server and file name as JS literals.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server given: 'file' is a complete (percent-encoded) URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # Upload date is only available as a tooltip hint; missing date is
        # tolerated with a warning rather than a hard failure.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
991
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # The first request must carry 'ax'/'ts' query parameters; the
        # cookie returned with it is required by the /serve request below.
        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # Track metadata is embedded as a JSON blob in a <script> tag.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except (ValueError, KeyError, IndexError):
            # Broadened from bare ValueError: valid JSON missing the
            # expected 'tracks' list previously escaped as an unhandled
            # KeyError/IndexError.
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
            # Look up the media URL inside the try so a missing 'url' key is
            # reported as an extraction error rather than a raw KeyError.
            final_url = song_data[u"url"]
        except (ValueError, KeyError):
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
1041
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        # The play page only issues a JS redirect; follow it manually.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(
            r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        page = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # The <title> also carries extra path-like parts; keep the first one.
        raw_title = self._html_search_regex(r'<title>(.*)</title>', page, u'title')
        title = raw_title.split('/')[0].strip()

        # POST to the info endpoint to obtain the media and thumbnail URLs.
        post_data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request("http://vbox7.com/play/magare.do", post_data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response looks like 'key1=<media url>&key2=<thumbnail url>'.
        final_url, thumbnail_url = [field.split('=')[1] for field in info_response.split('&')]

        return [{
            'id': video_id,
            'url': final_url,
            'ext': "flv",
            'title': title,
            'thumbnail': thumbnail_url,
        }]
1077
1078
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Ordered by matching priority; GenericIE must stay last as the fallback.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [klass() for klass in ie_classes]
1148
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes all follow the '<Name>IE' naming convention, so the
    # class can be looked up directly in this module's namespace.
    return globals()['%sIE' % ie_name]