]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
Move GoogleSearchIE into its own file
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19 import hashlib
20 import binascii
21 import urllib
22
23 from .utils import *
24 from .extractor.common import InfoExtractor, SearchInfoExtractor
25
26 from .extractor.ard import ARDIE
27 from .extractor.arte import ArteTvIE
28 from .extractor.dailymotion import DailymotionIE
29 from .extractor.gametrailers import GametrailersIE
30 from .extractor.generic import GenericIE
31 from .extractor.metacafe import MetacafeIE
32 from .extractor.statigram import StatigramIE
33 from .extractor.photobucket import PhotobucketIE
34 from .extractor.vimeo import VimeoIE
35 from .extractor.yahoo import YahooIE
36 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
37 from .extractor.zdf import ZDFIE
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Return a playlist dict holding up to n search results for query."""
        playlist = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for page in itertools.count(0):
            offset = page * 30
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), offset)
            webpage = self._download_webpage(
                result_url, query,
                note='Downloading results page ' + str(page + 1))
            info = json.loads(webpage)
            meta = info[u'm']

            for idx, result in enumerate(info[u'results']):
                if offset + idx >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', result)
                entry = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                playlist['entries'].append(entry)
            # Stop when we have collected n entries or the service says the
            # last page has been reached.
            if (offset + idx >= n) or (meta[u'last'] >= (meta[u'total'] - 1)):
                break

        return playlist
86
87
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Collect every video of a blip.tv user and return it as a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The numeric user id is scraped from the user page and substituted
        # into the mobile episode-list API endpoint.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # The API returns at most _PAGE_SIZE ids per request, so keep paging
        # until a short (non-full) page signals the end of the listing.
        video_ids = []
        for pagenum in itertools.count(1):
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            page_ids = []
            for match in re.finditer(r'href="/([^"]+)"', page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(unescapeHTML(candidate))

            video_ids.extend(page_ids)

            # A page with fewer than _PAGE_SIZE ids must be the last one.
            if len(page_ids) < self._PAGE_SIZE:
                break

        entries = [self.url_result(u'http://blip.tv/%s' % video_id, 'BlipTV')
                   for video_id in video_ids]
        return [self.playlist_result(entries, playlist_title = username)]
146
147
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Extract the direct file URL and title from a depositfiles page.

        Posts the 'Free download' form and scrapes the resulting page for
        the fileshare URL; on failure it tries to surface the site's own
        restriction message.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # BUG FIX: urlopen().read() returns bytes on Python 3; decode once
            # so the str regexes below work on both Python 2 and 3.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        # BUG FIX: file_id/file_url/file_extension are already text here;
        # the previous .decode('utf-8') calls raised AttributeError on Python 3.
        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
192
193
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Matches /video/video.php?v=<ID> and /photo.php?v=<ID> URLs.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in (best effort) with --username/--password or .netrc credentials.

        Silently skips login when no credentials are available; login
        failures only emit warnings so extraction can still be attempted.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    # netrc authenticators() -> (login, account, password)
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials at all: proceed anonymously.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # The login form being present in the response means login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL and metadata from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded as JSON between these two
        # JavaScript snippets in the page source.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-encoded JSON containing the stream descriptors.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream, fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
288
289
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    # Used to pull the filename extension off the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video info, handling api.swf / play-page redirects,
        direct media responses, and the regular JSON metadata endpoint."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # api.swf fragment URLs are rewritten to a /play/ URL first.
        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a URL whose fragment carries the real
            # file id; re-run extraction on the canonical /a/a-<id> URL.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Ask the same URL for JSON metadata (the iTunes UA is required by
        # the site to get this response format).
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # Server returned the media itself instead of JSON metadata.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): str.decode does not exist on Python 3 — this
                # direct-download path appears to assume Python 2 byte strings.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the metadata in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp format example: '12-31-12 11:59PM' -> '20121231'
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
387
388
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        """RC4-decrypt `data` (bytes) with `key` (bytes); returns a str.

        Standard RC4: key-scheduling over a 256-entry box, then the
        pseudo-random generation loop XORing each input byte.
        """
        x = 0
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self,s):
        """Return the MD5 hex digest of `s` as bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        """Extract a myvideo.de video, trying the plain <source> tag first
        and falling back to decrypting the RC4-obfuscated flashvars XML."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Doubly base64-encoded static key material used to derive the RC4 key.
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy path: a plain <source src='...'> tag with a direct FLV URL.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

            return [{
                'id':       video_id,
                'url':      video_url,
                'uploader': None,
                'upload_date':  None,
                'title':    video_title,
                'ext':      u'flv',
            }]

        # try encxml
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        # Split flashvars into request params and the encrypted-XML endpoint.
        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            # The MTV player variant is not usable; force the D player.
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data
        # Response looks like 'something=<hex>'; RC4 key is
        # md5(b64decode(b64decode(GK)) + md5(video_id)).
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            # RTMP delivery.
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            # NOTE(review): `video_filepath` is never defined anywhere in this
            # method — taking this f4m branch raises NameError. It presumably
            # should be the path extracted from dec_data above; confirm and fix.
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id':                 video_id,
            'url':                video_url,
            'tc_url':             video_url,
            'uploader':           None,
            'upload_date':        None,
            'title':              video_title,
            'ext':                u'flv',
            'play_path':          video_playpath,
            'video_file':         video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url':         video_swfobj,
        }]
537
538
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                     (?P<showname>thedailyshow|colbertnation)\.com/
                     (full-episodes/(?P<episode>.*)|
                      (?P<clip>
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrate labels offered by the config feed, lowest quality last.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate label.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolution per bitrate label (for --list-formats output).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print available formats with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve shortname/episode/clip URLs, fetch the show index feed,
        and return one info dict per part of the episode.

        RTMP stream URLs from the config feed are rewritten into plain
        HTTP URLs on the llnwd.net mirror.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # ':tds' / ':colbert' style shortcuts map to the full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare full-episodes URL means "download the newest episode";
            # the site redirects it to the specific episode URL.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # Follow the redirect to discover which episode is newest.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS index lists one <item> per part of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like '...:<showId>.com:<mediaId>'.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # Per-part config feed carries the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into a plain HTTP URL on the CDN mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
705
706
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract metadata from the episode page, then pull the MP4 URL
        from the player's JavaScript configuration file."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # BUG FIX: the field name passed here was u'player url' (copy-pasted
        # from the search above); it only labels error messages, but should
        # name what is actually extracted.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        # The player URL carries the config file location as a query param.
        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
766
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Fetch the moogaloop metadata XML, then the f4m manifest, and
        assemble the fragment URL for the stream."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        meta_url = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            meta_xml = compat_urllib_request.urlopen(meta_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        meta_doc = xml.etree.ElementTree.fromstring(meta_xml)
        try:
            video_node = meta_doc.findall('./video')[0]
            info['description'] = video_node.findall('./description')[0].text
            info['title'] = video_node.findall('./caption')[0].text
            info['thumbnail'] = video_node.findall('./thumbnail')[0].text
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifest_xml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        manifest_doc = xml.etree.ElementTree.fromstring(manifest_xml)
        try:
            media_node = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid manifest file')

        # Build the first-fragment URL from the manifest location and ids.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        info['url'] = '%s://%s/z%s/%sSeg1-Frag1' % (
            url_pr.scheme, url_pr.netloc, video_id[:-2], node_id)
        info['ext'] = 'f4f'
        return [info]
828
829
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract the FLV URL, title and thumbnail from an xvideos page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The flv_url parameter in the page source is URL-encoded.
        video_url = compat_urllib_parse.unquote(
            self._search_regex(r'flv_url=(.+?)&', webpage, u'video URL'))

        video_title = self._html_search_regex(
            r'<title>(.*?)\s+-\s+XVID', webpage, u'title')

        video_thumbnail = self._search_regex(
            r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
870
871
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve the track through the public API and fetch the MP3 stream URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader and the track slug are encoded in the URL.
        uploader, slug_title = mobj.group(1), mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        track_url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + track_url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # A second request lists the available streams for the resolved id.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)

        return [{
            'id': info['id'],
            'url': streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
928
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a set through the public API and return one entry per track."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader and the set slug are encoded in the URL.
        uploader, slug_title = mobj.group(1), mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        set_url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + set_url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        videos = []
        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            # Each track needs its own streams request to get the MP3 URL.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)

            videos.append({
                'id': video_id,
                'url': streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
991
992
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media id is base64-encoded inside the page source.
        m = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(
            base64.b64decode(m.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(r'contentTitle = "(.*?)";',
                                         webpage, u'title')
        video_description = self._html_search_regex(
            r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Derive id and extension from the final path component of the URL.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension,  # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
1035
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    # Disabled: the site switched to a new API.
    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData maps format name to either {bitrate: url list} or directly
        to a url list when no bitrate info is available.  Returns the url
        list for the requested format/bitrate, defaulting to the highest
        bitrate when 'best' (or an unknown bitrate) is requested.
        """
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # A plain request is enough to see whether the mirror answers.
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print one "<format> <bitrate> [<extension>]" line per variant."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # NOTE(review): .decode('utf-8') on a matched str only exists on
        # Python 2; on Python 3 this raises AttributeError.  The IE is
        # disabled (_WORKING = False), so this has not been exercised.
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe formats in dict order; keep the first reachable URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # NOTE(review): the .decode('utf-8') calls below are also
        # Python-2-only (see above).
        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
1140
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Dispatch on the URL shape: a single video (course + video),
        a course page (course only), or the site root.  Course and root
        pages recurse through self.extract() on every referenced page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata (title + file name) lives in an XML file
            # next to the videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Each VideoPage link on the course page becomes one reference
            # entry, de-duplicated in first-seen order.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse: each entry resolves to a single-video URL above.
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Each CoursePage link on the root page becomes one reference
            # entry, handled recursively by the course branch above.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
1236
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # <meta name="mtv_vt"> carries the song name ...
        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
                                            webpage, u'song name', fatal=False)

        # ... and <meta name="mtv_an"> the performer/artist.
        # BUG FIX: this value used to be assigned to video_title while the
        # info dict below referenced an undefined name `performer`, so every
        # extraction died with a NameError before returning.
        performer = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
                                            webpage, u'performer', fatal=False)

        # Compose "Performer - Song" when both pieces are available,
        # falling back to whichever one was found (or the bare id).
        if performer and song_name:
            video_title = u'%s - %s' % (performer, song_name)
        else:
            video_title = performer or song_name or video_id

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
                                           webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
                                        webpage, u'content id', fatal=False)

        # NOTE(review): mtvn_uri/content_id are extracted with fatal=False
        # but concatenated below; a None would raise TypeError here —
        # confirm they are always present before relaxing this further.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
1297
1298
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Generate a session id: millisecond timestamp + two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the decode alphabet for obfuscated file ids.

        A linear-congruential generator driven by the server-supplied seed
        repeatedly picks (and removes) a character from the source alphabet;
        the resulting permutation is the lookup table used by _get_file_id.
        Returns the permuted characters as a list.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        # Return the list itself (callers index into it), not a joined string.
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode an obfuscated file id: each '*'-separated token is an
        index into the seed-shuffled alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested quality onto Youku's format names:
            # 'best' -> hd2 when offered (still served as flv),
            # 'worst' -> mp4, anything else -> flv.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the zero-based segment index (as two hex digits) into
            # the decoded file id, one download URL per segment.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
1391
1392
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded inside the player parameters.
        video_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL'))

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
                                              webpage, u'title')
        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
                                             webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
1431
1432
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url, video_id = mobj.group(0), mobj.group(1)
        video_extension = 'flv'

        # Step 1: the post page carries metadata and a link to the video page.
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
        self.report_extraction(video_id)

        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
                                              webpage, u'upload date', fatal=False)
        if upload_date:
            # Normalize the timestamp to YYYYMMDD for filename use.
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d").strftime('%Y%m%d')

        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
                                           webpage, u'uploader', fatal=False)

        # Only the first line of the description serves as the title.
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
                                              webpage, 'title', default=u'NA')

        # Step 2: follow the page the image-box click would launch.
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
                                        webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Collect (resolution, url) pairs for every available size.
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        candidates = re.findall(pattern, webpage)
        if not candidates:
            raise ExtractorError(u'Unable to extract video links')

        # The highest resolution sorts last; keep only its URL.
        video_url = sorted(candidates)[-1][-1]

        # Unescape \u0026-style sequences embedded in the URL.
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
1506
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the path in the page URL.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(
            r'<meta name="description" (?:content|value)="(.*?)" />',
            webpage, 'description', fatal=False)

        return [{
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }]
1540
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Maximum number of clips returned per archive API page.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Fetch one API page and convert every clip that has a
        video_file_url into an info dict.  Returns (raw item count,
        list of info dicts)."""
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # The API signals errors with a dict instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with an ISO date; keep only YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Handle three URL shapes: a whole channel (paged archive), a
        chapter (/c/...), or a single archived broadcast (/b/...)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through the archive API below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # A chapter is a bracketed slice of an archived broadcast.
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Locate the <archive> element matching the page's archive id;
            # the for/else raises when no element matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Richer metadata (title, preview, channel) comes from the
            # newer Twitch kraken API.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single archived broadcast.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means the end of the archive was reached.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
1673
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The mp4 source is the second <source> tag in the <video> element.
        video_url = self._html_search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player heading; fall back to the page <title>.
        title = self._html_search_regex(
            (r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
             r'<title>(?P<title>[^<]+?)</title>'),
            webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }]
1702
class SteamIE(InfoExtractor):
    """Information extractor for Steam store video pages."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so match it explicitly.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Retry through the age gate when Steam asks for a birth date.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'

        videos = []
        # Movie entries, display titles and thumbnails appear in the same
        # order on the page, so walk the three scans in lockstep.
        for vid, vtitle, thumb in zip(re.finditer(urlRE, webpage),
                                      re.finditer(namesRE, webpage),
                                      re.finditer(thumbsRE, webpage)):
            video_id = vid.group('videoID')
            video_url = vid.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(vtitle.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
1757
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The flv recording lives at a predictable CDN location.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
                                              webpage, u'title')
        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
                                           webpage, u'uploader', fatal=False, flags=re.DOTALL)
        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
                                            webpage, u'thumbnail', fatal=False)

        # NOTE: unlike most extractors this returns a bare info dict
        # rather than a single-element list.
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
1789
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
                                       webpage_src, u'video URL')

        # Pick the container from the media URL itself.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
                                              webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
                                            webpage_src, u'thumbnail', fatal=False)
        if not thumbnail:
            mobj = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url' : video_url,
            'title' : video_title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
1829
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # The show metadata is embedded as a JSON assignment on the page.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
                                       webpage, u'json data', flags=re.MULTILINE)
        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        # Derive the extension from the URL path.
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
1863
1864
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry in formats whose 'format' equals req_format, or None."""
        for x in formats:
            if x['format'] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The age-verification cookie is required to see the video page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as e:
            # str(e): concatenating the exception object itself raised a
            # TypeError that masked the real missing-key error
            raise ExtractorError(u'Missing JSON parameter: ' + str(e))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # Path segment 4 is e.g. "480p_370k_8004515": resolution, bitrate, id
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join(format)
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # assumes the page lists formats best-first -- TODO confirm
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # BUG FIX: this previously tested the undefined name `result`,
            # raising a NameError instead of the intended ExtractorError
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
1969
1970
1971
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flv URL sits inside the player configuration, percent-encoded.
        video_url = compat_urllib_parse.unquote(self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url'))

        # Upload date (optional) is normalized to YYYYMMDD.
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{'id': video_id,
                 'url': video_url,
                 'uploader': None,
                 'upload_date': upload_date,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv'}]
2006
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        page = self._download_webpage(url, video_id)

        title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            page, u'title').strip()

        # The media is only referenced from a separate embed page.
        embed_mobj = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', page)
        if embed_mobj is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_mobj.group(0).strip()
        # The embed page carries its own (numeric) video id.
        video_id = embed_mobj.group('videoid')

        embed_page = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            embed_page, u'video URL')

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
2047
class EightTracksIE(InfoExtractor):
    """Information Extractor for 8tracks.com mixes (returns one entry per track)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded in the page as a JS assignment.
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # A random session id is passed to the play/next API endpoints.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        # Walk the mix one track at a time: the API only reveals the next
        # track after the previous one has been requested.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            # The next request must reference the track id we just received.
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
2088
class KeekIE(InfoExtractor):
    """Information Extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail live on the CDN at predictable locations.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
2116
class TEDIE(InfoExtractor):
    """Information Extractor for TED talks and TED playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch: a single talk or a whole playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Each talk becomes a url_result entry that is re-extracted by this IE.
        # NOTE(review): video_id is captured but unused; only talk_url matters here.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        # Stream URLs come from an embedded talkDetails JSON object.
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                    webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # The last htmlStreams entry is used; presumably the best quality -- TODO confirm.
        info = {
            'id': info['id'],
            'url': info['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': desc,
        }
        return info
2191
class MySpassIE(InfoExtractor):
    """Information Extractor for myspass.de videos (metadata via XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: was `format = ext` -- `ext` is undefined here and raised
            # a NameError; fall back to the file extension instead
            format = extension
        else:
            format = format_id_el.text
        # Description and thumbnail are optional in the metadata document.
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
2245
class SpiegelIE(InfoExtractor):
    """Information Extractor for Spiegel Online videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        # The available format variants are published in a per-video XML file.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last child element is used; presumably the best variant -- TODO confirm.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': title,
            'duration': duration,
        }]
2277
class LiveLeakIE(InfoExtractor):
    """Information Extractor for LiveLeak videos."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # og:title carries a site prefix which is stripped off.
        title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'uploader': uploader,
        }]
2314
2315
2316
class TumblrIE(InfoExtractor):
    """Information Extractor for Tumblr video posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        blog = mobj.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The video tag is embedded inside escaped (\xNN) JS markup.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video_mobj = re.search(re_video, webpage)
        if video_mobj is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video_mobj.group('video_url')
        ext = video_mobj.group('ext')

        # We pick the first poster
        thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if thumbnail:
            thumbnail = thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumbnail,
                 'ext': ext
                 }]
2350
class BandcampIE(InfoExtractor):
    """Information Extractor for free Bandcamp tracks (mp3-320 only)."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # The numeric track id is taken from the TralbumData JS object.
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                       webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some JavaScript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        # (the .rand value is a fixed constant taken from that script)
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key; with a wrong .rand we get "retry_url" instead
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id':id,
                      'title' : info[u'title'],
                      'ext' : 'mp3',
                      'url' : final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
                      }

        return [track_info]
2396
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The mp4 source is exposed directly in a <source> element.
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')
        title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
        }]
2424
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Metadata (including the mp4 URL) comes from the player's MRSS feed.
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
        }]
2451
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical page for the video id.
        webpage = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
        }]
2485
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical https page for the id.
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
2519
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')

        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-video secret is required by both playlist endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The final URL is APP + FULLPATH from the <STREAM> element.
        stream_mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream_mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream_mobj.group(1) + unescapeHTML(stream_mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
2568
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id is only available inside the page markup.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # The actual media URL lives in a separate XML document.
        data = self._download_webpage('http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }]
2607
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Canonical page for the id (the title slug is not needed).
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # Player config: an optional server prefix plus the file name/key.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # Empty server: 'file' is a complete percent-encoded URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # Upload date, when found, is normalized to YYYYMMDD.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
2659
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # A timestamped request is needed to obtain the session cookie
        # that the /serve endpoint below requires.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # Track metadata is embedded in the page as a JSON <script> block.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        # NOTE: track_id is rebound here to the id from the JSON payload.
        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # The /serve endpoint hands out the actual media URL; it needs
        # the cookie captured from the first request.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
2709
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page redirects via JavaScript; follow it manually by
        # reading the scripted window.location assignment.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        # The media URL must be requested from a form-encoded POST endpoint.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # The response is two '&'-separated key=value pairs; the values are
        # the media URL and the thumbnail URL (in that order).
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id': video_id,
            'url': final_url,
            'ext': ext,
            'title': title,
            'thumbnail': thumbnail_url,
        }]
2745
2746
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Most specific extractors come first; GenericIE is the catch-all fallback
    # and must stay last. Do not reorder.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]
2816
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention and live
    # at module level, so a globals() lookup resolves them.
    class_name = ie_name + 'IE'
    return globals()[class_name]