]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
Remove useless headers
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.dailymotion import DailymotionIE
24 from .extractor.gametrailers import GametrailersIE
25 from .extractor.generic import GenericIE
26 from .extractor.metacafe import MetacafeIE
27 from .extractor.statigram import StatigramIE
28 from .extractor.photobucket import PhotobucketIE
29 from .extractor.vimeo import VimeoIE
30 from .extractor.yahoo import YahooIE
31 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
32 from .extractor.zdf import ZDFIE
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the Yahoo video search JSON endpoint (30 results
        per page, via the b= offset parameter) until n entries have been
        collected or the service reports the last page.
        """
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # Skip results whose markup we cannot parse instead of
                    # crashing on mobj.group() (the original raised
                    # AttributeError here when the regex did not match).
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Stop when enough entries were collected or the service
            # indicates this was the last page.
            if (pagenum * 30 + i >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
81
82
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            # Fail with a clear message instead of AttributeError on None
            # (the original dereferenced mobj.group(1) unconditionally).
            raise ExtractorError(u'Unable to extract user id')
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Unescape first so the duplicate check compares the same
                # value that is stored (the original checked the raw match
                # but appended the unescaped one, letting duplicates slip
                # through whenever an id contained HTML entities).
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
141
142
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Decode the response so the str regexes below work: on
            # Python 3 urlopen().read() returns bytes, and mixing str
            # patterns with bytes data raises TypeError.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        # file_id/file_url/file_extension are already text at this point;
        # the original called str.decode('utf-8') on them, which raises
        # AttributeError on Python 3.
        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
187
188
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in using --username/--password or .netrc credentials, if any.

        Login failures only emit warnings; extraction proceeds
        unauthenticated in that case.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials available; proceed without logging in.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
        }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # Decode the response so the str regex below works on
            # Python 3 (urlopen().read() returns bytes).
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded as JSON between these two
        # JavaScript fragments in the page source.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream; fall back to SD. Use .get() for both so a
        # missing 'sd_src' reaches the explicit error below instead of
        # raising a bare KeyError (as the original did, which made the
        # 'Cannot find video URL' branch unreachable).
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data.get('sd_src')
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
283
284
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    # Extracts the file extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Rewrite api.swf fragment URLs into /play/ URLs.
        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect; the real file id lives in the URL
            # fragment of the redirect target. Re-enter extraction with
            # the canonical /a/a-<id> URL.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Ask the page for its JSON representation.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data depending on the User-Agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): title is already str on Python 3, where
                # .decode raises AttributeError — this path looks py2-only;
                # confirm before relying on it.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh was opened in the try block above; if the server did
                # not answer with a video/* Content-Type, the body is JSON.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp is e.g. '05-31-13 08:15PM'; normalize to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    # The same UA must be used when downloading the media.
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
382
383
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self, data, key):
        """RC4-decrypt *data* (bytes) with *key* (bytes); returns a str."""
        x = 0
        box = list(range(256))
        # Key-scheduling algorithm (KSA).
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        # Pseudo-random generation algorithm (PRGA), XOR'd over the data.
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self, s):
        """Return the hex MD5 digest of *s* (bytes), encoded back to bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Doubly base64-encoded site key used to derive the RC4 key below.
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy path: a plain <source> element means no decryption is needed.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

            return [{
                'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': u'flv',
            }]

        # try encxml: the player config is served as RC4-encrypted data.
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # RC4 key = md5(b64decode(b64decode(GK)) + md5(video_id)).
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            # The original referenced an undefined name 'video_filepath'
            # here (NameError on every f4m video); recover the path from
            # the decrypted data like the other fields above.
            video_filepath = self._search_regex('filepath=\'(.*?)\'', dec_data, u'filepath')
            video_filepath = compat_urllib_parse.unquote(video_filepath)
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'tc_url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
            'play_path': video_playpath,
            'video_file': video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url': video_swfobj,
        }]
532
533
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                           (?P<clip>
                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                      $"""

    # Known bitrate ids, lowest to highest quality.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate id (all mp4 currently).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display dimensions per bitrate id (used by --list-formats output).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL requires the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        # Pretty-print available format ids with extension and dimensions.
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Expand :tds / :colbert style abbreviations to the show's
        # full-episodes page and re-match.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare full-episodes URL means "download the newest episode";
            # the site redirects it to the concrete episode URL.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # Follow the redirect and re-parse to learn the episode title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # mgid URIs for the player are embedded either in a <param> tag or
        # in a JS 'var url = "..."' assignment.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # The mgid URI identifies the episode; the MRSS feed lists its parts.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # Each <item> is one part of the episode; fetch its media config
        # to learn the available renditions.
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, rtmp-url) pairs for every rendition.
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                # NOTE(review): this returns None instead of the accumulated
                # results list; callers that iterate the return value may
                # break when --list-formats is used. Present in original.
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp URL into the equivalent progressive-HTTP URL.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
700
701
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # The field name passed here only labels error messages; the
        # original mislabelled this title lookup as u'player url'.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        # The player URL carries the config file location in its query string.
        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Entry 0 is an ad/intro; entry 1 is the actual video.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
761
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report that the XML manifest is being downloaded."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        # First fetch: the moogaloop metadata XML (title, description,
        # thumbnail and the manifest location).
        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # Second fetch: the f4m manifest, which names the media segment.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # The original message wrongly said 'video info XML' here even
            # though this request downloads the manifest.
            raise ExtractorError(u'Unable to download video manifest XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            # (dropped the unused 'as err' binding of the original)
            raise ExtractorError(u'Invalid manifest file')

        # Build the direct fragment URL from the manifest pieces.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
823
824
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The flash URL is percent-encoded inside the page's flashvars.
        flv_url = self._search_regex(r'flv_url=(.+?)&', webpage, u'video URL')
        decoded_url = compat_urllib_parse.unquote(flv_url)

        # Page title, stripped of the trailing " - XVIDEOS..." suffix.
        page_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
            webpage, u'title')

        # Thumbnail is optional; extraction proceeds without it.
        thumb = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': decoded_url,
            'uploader': None,
            'upload_date': None,
            'title': page_title,
            'ext': 'flv',
            'thumbnail': thumb,
            'description': None,
        }]
865
866
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract slug of song title (also in the url); together they
        # form the full title used for progress reporting.
        # (The original also built an unused 'simple_title' local here.)
        slug_title = mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the page URL into the track's API metadata.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Fetch the stream definitions for the track id; we use the
        # 128kbps MP3 HTTP stream.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
923
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract slug of the set title (also in the url); together they
        # form the full title used for progress reporting.
        # (The original also built an unused 'simple_title' local here.)
        slug_title = mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the page URL into the set's API metadata.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        # One entry per track in the set; each needs its own stream lookup.
        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id': video_id,
                'url': mediaURL,
                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
986
987
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the RTMP stream URL and metadata from an InfoQ page.

        The real media id is base64-encoded in the page's `jsclassref`
        variable; the rtmpe URL is built from it.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL (base64-encoded in the page source)
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        video_filename = video_url.split('/')[-1]
        # BUGFIX: use rsplit with maxsplit=1 so a filename containing extra
        # dots (e.g. 'talk.v2.mp4') still unpacks into (id, extension)
        # instead of raising ValueError as plain split('.') did.
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
1030
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        If the format entry maps bitrates to url lists, pick the requested
        bitrate (falling back to the highest available); otherwise the
        entry is the url list itself.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """List every available format/bitrate pair on stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # BUGFIX: the old code called str.decode('utf-8') on these values
        # (and on several strings below), which only exists on Python 2 and
        # raises AttributeError on Python 3; regex groups are already text.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # BUGFIX: initialise so an empty formats dict cannot leave
        # format_param unbound at the return statement below.
        format_param = None
        if req_format is None or req_format == 'best':
            # Probe each format until one of its urls is reachable.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
1135
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        # Three URL shapes map to three behaviours:
        #   course+video -> a single lecture video,
        #   course only  -> a playlist of that course's lectures,
        #   neither      -> the site root, a playlist of all courses.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Title and file name come from a per-video XML descriptor.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Each lecture link becomes a reference entry that is resolved
            # recursively through self.extract below.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Every course page on the root becomes a reference entry,
            # recursively expanded to its lectures via self.extract.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
1231
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract the media URL for an MTV video page.

        Reads the mtvn <meta> tags from the page, then fetches the
        mediaGen XML that lists the available renditions and picks the
        highest-quality one.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')

        # BUGFIX: `performer` was referenced in the info dict below without
        # ever being assigned, so every extraction died with a NameError.
        # The mtv_an tag holds the artist name, so reuse that value.
        performer = video_title

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
1292
1293
class YoukuIE(InfoExtractor):
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-unique session id from the current time (ms) and two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the character table used to decode obfuscated file ids.

        A linear-congruential generator seeded with the server-provided
        `seed` repeatedly draws characters (without replacement) from a
        fixed alphabet; the resulting permutation is the decode table.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated obfuscated file id via the mix table for `seed`."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # 'best' prefers hd2 when available; any other requested format
            # string (even an explicit one) falls back to flv.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            # One download URL (and one info entry) per video segment.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
1386
1387
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Pull the flv url, title and thumbnail out of the video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv url is percent-encoded inside the player parameters.
        video_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL'))

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
            webpage, u'title')

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
1426
1427
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        # The post URL and the short video id both come from the URL itself.
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = match.group(0)
        video_id = match.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        entry_page = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Timestamp of the post, reformatted for use in the filename.
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            entry_page, u'upload date', fatal=False)
        if upload_date:
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d").strftime('%Y%m%d')

        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            entry_page, u'uploader', fatal=False)

        # The first line of the post description doubles as the title.
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            entry_page, 'title', default=u'NA')

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            entry_page, u'video page URL')
        video_webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Collect (resolution, url) pairs for every available size.
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        found = re.findall(pattern, video_webpage)
        if not found:
            raise ExtractorError(u'Unable to extract video links')

        # Highest resolution wins; keep only the url half of the pair.
        video_url = max(found)[-1]

        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
1501
class NBAIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Build the CDN mp4 URL from the page path and scrape the metadata."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        # The media URL can be derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        return [{
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }]
1535
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Maximum number of clips the archive API returns per page.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one archive API page; convert each clip that has a
        usable video_file_url into an info dict.

        Returns (items_on_page, info_dicts); the raw count lets the caller
        detect the final (short) page.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # On failure the API answers with an error object, not a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with an ISO date; keep just YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through its archive listing below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # Single chapter: resolve it to its parent archive file.
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: pick the archive entry matching the page's id;
            # the else branch fires only when no entry matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single archived broadcast (one page, no paging).
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page marks the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
1668
class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Scrape the mp4 source, title and description from a video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = match.group('id')
        webpage = self._download_webpage(url, video_id)

        # The mp4 source is the second <source> tag of the <video> element.
        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player-page heading, fall back to the document title.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }]
1697
class SteamIE(InfoExtractor):
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose form, so the default matcher
        # cannot be used; match explicitly with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a playlist of every trailer on a game's video page."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # An age gate may be interposed; confirm it and refetch the page.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
            webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        videos = []
        # Movie entries, display names and thumbnails appear in the same
        # order on the page, so walk the three match iterators in lockstep.
        for vid, vtitle, thumb in zip(re.finditer(urlRE, webpage),
                                      re.finditer(namesRE, webpage),
                                      re.finditer(thumbsRE, webpage)):
            video_id = vid.group('videoID')
            video_url = vid.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(vtitle.group('videoName')),
                'thumbnail': thumb.group('thumbnail')
            })
        return [self.playlist_result(videos, gameID, game_title)]
1752
class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Derive the CDN flv URL from the recorded-video id and scrape metadata."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The flv lives at a CDN URL built directly from the id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        # NOTE: unlike most extractors in this file, this one returns a
        # bare info dict (not a one-element list); preserved as-is.
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
1784
class WorldStarHipHopIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Scrape the player file URL, title and thumbnail from a video page."""
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # Anything that is not an mp4 link is served as flv.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                video_title = candy_match.group(1)

        return [{
            'id': video_id,
            'url' : video_url,
            'title' : video_title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
1824
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Read the show metadata embedded as JSON in the page's window.gon assignment."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
1858
1859
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # Return the first entry whose 'format' equals req_format, else None.
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site gates content behind an age check; this cookie bypasses it.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # Narrowed from a bare `except:` so unrelated errors propagate.
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as e:
            # Fixed: old code concatenated str + exception object, which
            # raised TypeError instead of the intended ExtractorError.
            raise ExtractorError(u'Missing JSON parameter: %s' % e)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Fixed: previously tested the undefined name `result`, which
            # raised NameError whenever a specific format was requested.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
1964
1965
1966
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flv URL sits (percent-encoded) in the player setup code.
        video_url = compat_urllib_parse.unquote(self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url'))

        # Upload date is optional; normalize to YYYYMMDD when present.
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{'id': video_id,
                 'url': video_url,
                 'uploader': None,
                 'upload_date': upload_date,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv'}]
2001
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<title>(?P<title>.*)</title>', webpage, u'title').strip()

        # The actual media URL lives on a separate embed page.
        embed_match = re.search(
            r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
2042
class EightTracksIE(InfoExtractor):
    """Extracts every track of an 8tracks.com mix as separate entries."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API wants a (random) session token plus the mix id.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        i = 0
        # Tracks are handed out one at a time; follow the chain until the
        # API flags the last track.
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
            i += 1
        return entries
2083
class KeekIE(InfoExtractor):
    """Extractor for keek.com; media URLs are derived from the video id."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Both CDN URLs are fully determined by the video id alone.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
2111
class TEDIE(InfoExtractor):
    # Matches both single talks and playlists; compiled with re.VERBOSE.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL type: single talk vs. playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): the two finditer() streams below are paired with
        # zip(), so both regexes must yield matches in document order.
        video_RE=r'''
                 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                 ([.\s]*?)data-playlist_item_id="(\d+)"
                 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                 '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Each talk is delegated back to this extractor via url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                       webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # NOTE(review): `info` (the parsed JSON blob) is rebound here to the
        # result dict; the last entry of htmlStreams is the one returned.
        info = {
                'id': info['id'],
                'url': info['htmlStreams'][-1]['file'],
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'description': desc,
                }
        return info
2186
class MySpassIE(InfoExtractor):
    """Extractor for myspass.de, driven by the site's XML metadata endpoint."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Fixed: the fallback referenced the undefined name `ext`
            # (NameError); fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
2240
class SpiegelIE(InfoExtractor):
    """Extracts spiegel.de videos via the per-video flash XML descriptor."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', webpage, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last element of the XML document is the one used.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        return [{
            'id': video_id,
            'url': video_url,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
2272
class LiveLeakIE(InfoExtractor):
    """Extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Drop the site-name prefix LiveLeak puts into og:title.
        raw_title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        video_title = raw_title.replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(
            r'By:.*?(\w+)</a>', webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }]
2309
2310
2311
class TumblrIE(InfoExtractor):
    """Extracts videos embedded in Tumblr posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        blog = mobj.group('blog_name')

        # Always fetch the canonical post URL, whatever form we were given.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is JS-escaped; \x22 stands for a double quote.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': ext
        }]
2345
class BandcampIE(InfoExtractor):
    # NOTE(review): left byte-identical — the free-download flow below is a
    # fixed sequence of requests and regexes that is easy to break.
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Extract a free Bandcamp track by walking the free-download flow."""
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Track id scraped from the embedded TralbumData JS object.
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                       webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        #We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        #in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id':id,
                      'title' : info[u'title'],
                      'ext' : 'mp3',
                      'url' : final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
                      }

        return [track_info]
2391
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The mp4 source is exposed in an HTML5 <source> tag.
        video_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL')
        video_title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
2419
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Metadata (including the mp4 URL) comes from the player MRSS feed.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)
        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
2446
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical page for the video id.
        webpage = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)
        self.report_extraction(video_id)

        video_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
2480
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical https page for the id.
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)
        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
2514
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')

        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # A per-photo "secret" is needed by both playlist endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The playlist XML names the streaming app and the media path.
        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
2563
class TeamcocoIE(InfoExtractor):
    """Extractor for teamcoco.com; resolves a slug to its CVP data feed."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id (needed for the data feed URL) is embedded in the page.
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', webpage, u'video id')
        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data = self._download_webpage('http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
2602
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # NOTE(review): despite the name, this is the plain movie page URL.
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config provides either a full URL or a server+key pair.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # Upload date is taken from a "YYYY-MM-DD hh:mm:ss TZ" tooltip hint.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
2654
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    # NOTE(review): left byte-identical — the cookie captured from the first
    # request must be replayed on the /serve request below, so the request
    # order matters.
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # A timestamp query parameter is appended to the page request.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            # Only the first track of the embedded list is extracted.
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
2704
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page performs a JS redirect; follow it by hand.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # POST to the info endpoint; the reply is "key=value&key=value".
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        final_url, thumbnail_url = (field.split('=')[1] for field in info_response.split('&'))

        return [{
            'id': video_id,
            'url': final_url,
            'ext': 'flv',
            'title': title,
            'thumbnail': thumbnail_url,
        }]
2740
2741
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep this ordered from most to least specific: GenericIE must stay last,
    # and the specialised YouTube extractors must precede YoutubeIE itself.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [ie_class() for ie_class in ie_classes]
2811
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention and live at
    # module level, so a plain globals() lookup resolves them.
    class_name = '%sIE' % ie_name
    return globals()[class_name]