# ]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
# Move GenericIE into its own file
# [yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19 import hashlib
20 import binascii
21 import urllib
22
23 from .utils import *
24 from .extractor.common import InfoExtractor, SearchInfoExtractor
25
26 from .extractor.ard import ARDIE
27 from .extractor.arte import ArteTvIE
28 from .extractor.dailymotion import DailymotionIE
29 from .extractor.gametrailers import GametrailersIE
30 from .extractor.generic import GenericIE
31 from .extractor.metacafe import MetacafeIE
32 from .extractor.statigram import StatigramIE
33 from .extractor.photobucket import PhotobucketIE
34 from .extractor.vimeo import VimeoIE
35 from .extractor.yahoo import YahooIE
36 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
37 from .extractor.zdf import ZDFIE
38
39
40
41
42
43
44
45
46
47
48
49
50
51
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Return a playlist dict containing up to n URL results for query."""

        playlist = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        pagenum = 0
        while True:
            pagenum += 1
            result_url = (u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en'
                          % (compat_urllib_parse.quote_plus(query), pagenum * 10))
            webpage = self._download_webpage(
                result_url, u'gvsearch:' + query,
                note='Downloading result page ' + str(pagenum))

            # Each organic result carries its video URL in an <h3 class="r"> anchor
            playlist['entries'].extend(
                {'_type': 'url', 'url': m.group(1)}
                for m in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage))

            # Stop once enough results were requested or there is no "next" link
            if pagenum * 10 > n or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return playlist
82
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Return a playlist dict with up to n Yahoo! Screen results for query.

        Pages through the JSON search endpoint 30 results at a time until
        either n entries were collected or the API reports the last page.
        """

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                # BUGFIX: skip result entries whose markup contains no video
                # link instead of crashing with AttributeError on mobj.group().
                if mobj is None:
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            if (pagenum * 30 + i >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
116
117
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return a playlist of every video uploaded by a blip.tv user."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        # BUGFIX: fail with a clear message instead of an AttributeError on
        # .group() when the user id is not present in the page.
        if mobj is None:
            raise ExtractorError(u'Unable to extract user id for %s' % username)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # BUGFIX: deduplicate on the unescaped id (the value actually
                # stored), not on the raw HTML-escaped match text.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
176
177
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Resolve a depositfiles.com file page to its direct download URL."""
        file_id = url.split('/')[-1]
        # Rebuild the URL in the English locale so the markup is predictable
        url = 'http://depositfiles.com/en/files/' + file_id

        # Request the page as if the 'Free download' button had been pressed
        post_data = compat_urllib_parse.urlencode({ 'gateway_result' : '1' })
        request = compat_urllib_request.Request(url, post_data)
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Look for the form holding the real file URL
        mobj = re.search('<form action="(http://fileshare.+?)"', webpage)
        if mobj is None or mobj.group(1) is None:
            # No download form: surface the site's own error message if any
            mobj = re.search('<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if mobj is not None and mobj.group(1) is not None:
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # The title lives in the 'title' attribute of a <b> element
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
        }]
222
223
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        # Optional login step: credentials may come from the downloader's
        # username/password params or from a .netrc entry for 'facebook'.
        # Extraction proceeds anonymously when no credentials are available.
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials: skip login entirely
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login <form> still present in the response means the
            # credentials were rejected (we are back on the login page).
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL and metadata from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded as JSON in inline JS, delimited
        # by these two literal code fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is percent-encoded JSON holding the stream descriptors
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream; fall back to SD
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
318
319
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video info, handling player/API URLs and direct downloads."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        # api.swf#<id> URLs are rewritten to the equivalent /play/ URL
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a page whose URL fragment carries the
            # real file reference; follow the redirect, build the canonical
            # /a/a-<file_id> URL and recurse once with it.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        # Ask blip.tv for the JSON representation of the page (no HTML wrap)
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different (simpler) responses to the iTunes UA
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The URL itself points at the media file; derive the title
                # and extension from the final path component.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh is still open from the request above; read the JSON body
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' key or flat
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # e.g. '05-31-13 07:22PM' -> '20130531'
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
417
418
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        # Plain RC4 stream cipher: key-scheduling loop over the 256-entry
        # box, then the pseudo-random generation loop XORs the keystream
        # into data. Returns the decrypted bytes as a str.
        x = 0
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self,s):
        # Hex digest of md5(s), returned as bytes
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        """Extract the video URL, decrypting the player XML when necessary."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Static, double-base64-encoded secret used to derive the RC4 key
        GK = (
            b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
            b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
            b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy case: the page contains a plain <source> element
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

            return [{
                'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': u'flv',
            }]

        # try encxml
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        params = {}
        encxml = ''
        sec = mobj.group(1)
        # flashvars is a JS object literal of 'key':'value' pairs; the
        # special _encxml entry points at the encrypted-XML endpoint, all
        # other pairs are forwarded as query parameters.
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data
        # The response is 'something=<hex>'; take the hex part and unhexlify
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # RC4 key = md5(base64-decode twice of GK + md5(video_id))
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            # RTMP case
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            # NOTE(review): 'video_filepath' is not defined anywhere in this
            # method, so taking this branch raises NameError. Presumably the
            # path extracted from dec_data was intended — needs fixing.
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'tc_url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
            'play_path': video_playpath,
            'video_file': video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url': video_swfobj,
        }]
567
568
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                     (?P<showname>thedailyshow|colbertnation)\.com/
                     (full-episodes/(?P<episode>.*)|
                      (?P<clip>
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrate identifiers (not referenced within this class body)
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate identifier
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolution per bitrate identifier (used for --list-formats)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        # Print a human-readable table of available format ids
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract all parts of an episode (or a single clip) as a result list."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Expand the :tds / :colbert style abbreviations to the
        # corresponding full-episodes page, then re-match.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare show URL means "download the newest full episode"
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # The show page redirects to the newest episode; re-parse the
            # final URL to recover the concrete episode component.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS index feed lists one <item> per part of the episode
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The guid looks like '...:<showId>.com:<mediaId>'
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            # Each <rendition> advertises one (bitrate, rtmp url) variant
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP stream path onto the public HTTP mirror
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
735
736
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract a video from an escapistmagazine.com view page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # BUGFIX: this regex extracts the title, but the field name passed to
        # _html_search_regex (used in error/log messages) wrongly said
        # 'player url' — a copy-paste from the extraction above.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        # The player URL carries the (percent-encoded) config URL as a query arg
        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
796
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Resolve a collegehumor video through its moogaloop XML + f4m manifest."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # First request: page metadata XML
        meta_url = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            meta_xml = compat_urllib_request.urlopen(meta_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        meta_doc = xml.etree.ElementTree.fromstring(meta_xml)
        try:
            video_node = meta_doc.findall('./video')[0]
            info['description'] = video_node.findall('./description')[0].text
            info['title'] = video_node.findall('./caption')[0].text
            info['thumbnail'] = video_node.findall('./thumbnail')[0].text
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # Second request: the Adobe HDS (f4m) manifest
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifest_xml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        manifest_doc = xml.etree.ElementTree.fromstring(manifest_xml)
        try:
            media_node = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid manifest file')

        # Build the fragment URL from the manifest location and ids
        parsed = compat_urllib_parse_urlparse(manifest_url)
        info['url'] = parsed.scheme + '://' + parsed.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
        info['ext'] = 'f4f'
        return [info]
858
859
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract the FLV URL, title and thumbnail from an xvideos page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The media URL is percent-encoded inside the flv_url flashvars entry
        video_url = compat_urllib_parse.unquote(
            self._search_regex(r'flv_url=(.+?)&', webpage, u'video URL'))

        # Title is the <title> tag minus the trailing site suffix
        video_title = self._html_search_regex(
            r'<title>(.*?)\s+-\s+XVID', webpage, u'title')

        video_thumbnail = self._search_regex(
            r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
900
901
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

    Resolves the track page through the public API to obtain the track
    metadata, then fetches the stream definitions to pick the MP3 URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader and the track slug are encoded in the URL
        uploader, slug_title = mobj.group(1), mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the canonical track page into its API representation
        track_url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + track_url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Fetch the per-track stream definitions and take the 128k MP3
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)

        return [{
            'id': info['id'],
            'url': streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
958
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets

    The set page URL is resolved through SoundCloud's public resolve
    endpoint into JSON metadata that lists every track; each track's
    stream definitions are then fetched from api.sndcdn.com and its
    128 kbps MP3 stream is used.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report that the set id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The uploader and the set's title slug are both part of the URL.
        uploader, slug_title = mobj.group(1), mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the set page into its JSON metadata.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info = json.loads(self._download_webpage(resolv_url, full_title))

        # The API reports failures through an 'errors' list; report each
        # one and bail out (returns None, as before).
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        videos = []
        for track in info['tracks']:
            track_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(track_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, track_id, u'Downloading track info JSON')

            self.report_extraction(track_id)
            streams = json.loads(stream_json)

            videos.append({
                'id': track_id,
                'url': streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
1021
1022
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media path is base64-encoded in the page's jsclassref
        # variable; decoding it yields the RTMP path.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Derive id and extension from the final path component of the URL.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
1065
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com

    Queries the (v1) cloudcast JSON API for a show's metadata, then probes
    the advertised audio format URLs and returns the first one that is
    actually reachable.
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        For 'best' (or an unknown bitrate) the highest available bitrate is
        chosen; formats without bitrate sub-dicts are returned as-is.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """Print every advertised format/bitrate with its file extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # BUG FIX: the original called .decode('utf-8') on these str
        # objects, which raises AttributeError on Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe each advertised format until one has a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
1170
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video (course+video), a course
    page (playlist of videos), and the root page (playlist of courses).
    Playlist pages recurse back into this extractor via 'reference'
    entries.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _page_references(self, page, link_re):
        # Build a 'reference' entry for every page link matching link_re;
        # factored out of the course/root branches, which were identical.
        links = orderedSet(re.findall(link_re, page))
        return [
            {
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(page_link),
            }
            for page_link in links]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Raw string: '\?' is an invalid escape sequence in a normal
            # string literal (warning on modern Pythons).
            info['list'] = self._page_references(coursepage, r'<a href="(VideoPage.php\?[^"]+)">')
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            info['list'] = self._page_references(rootpage, r'<a href="(CoursePage.php\?[^"]+)">')
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
1266
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # mtv_vt carries the song name; mtv_an carries the artist name.
        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')

        # BUG FIX: the info dict below referenced an undefined name
        # 'performer' (guaranteed NameError at runtime). mtv_an is the
        # artist name, so reuse that extraction as the uploader.
        performer = video_title

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
1327
1328
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves videos as multiple obfuscated segments: the file id is
    scrambled with a per-video 'seed', and each segment URL carries its
    own key, so extraction reimplements the player's descrambling scheme.
    """
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        # Session id: current time in milliseconds followed by two random
        # components, mimicking the official player's sid format.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Derive the substitution alphabet from the video's seed.

        A linear-congruential sequence seeded with `seed` repeatedly picks
        (and removes) characters from the source alphabet, producing the
        permutation used to decode the scrambled file id.  Statement order
        matters: each pick shrinks `source` before the next iteration.
        """
        mixed = []
        # NOTE(review): '\:' in this literal is an invalid escape sequence
        # (kept as backslash + colon at runtime) — present in the original.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # Each '*'-separated token of the scrambled id is an index into the
        # seed-derived alphabet; concatenating the picks yields the real id.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # The player config JSON carries title, seed, formats and segment keys.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested format onto Youku's stream names:
            # 'hd2' when available for 'best', 'mp4' for 'worst', else 'flv'.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
1421
1422
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Fetch the page, then pull url/title/thumbnail out of it with the
        # class-level patterns above.
        webpage = self._download_webpage(url, video_id)

        flv_url = self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL')
        # The flash variable is percent-encoded in the page source.
        flv_url = compat_urllib_parse.unquote(flv_url)

        page_title = self._html_search_regex(self.VIDEO_TITLE_RE, webpage, u'title')

        thumb_url = self._search_regex(self.VIDEO_THUMB_RE, webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': flv_url,
            'uploader': None,
            'upload_date': None,
            'title': page_title,
            'ext': 'flv',
            'thumbnail': thumb_url,
            'description': None,
        }]
1461
1462
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        # Both the full post URL and the short id come from _VALID_URL.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url, video_id = mobj.group(0), mobj.group(1)

        video_extension = 'flv'

        # Step 1: the post page carries date, author and title.
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Reformat the timestamp for use in a filename.
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # The first line of the description doubles as the title.
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2: follow the link the image box would open to reach the
        # actual video page.
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Collect every (resolution, url) pair advertised on the video page.
        all_links = re.findall('\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"', webpage)
        if len(all_links) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sorting puts the highest resolution last; keep only its URL.
        video_url = sorted(all_links)[-1][-1]

        # Unescape \u0026-style hex sequences.
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
1536
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_path = mobj.group(1)

        page = self._download_webpage(url, video_path)

        # The CDN URL is derived directly from the path in the page URL.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_path + '_nba_1280x720.mp4'

        short_id = video_path.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            page, 'title', default=short_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', page, 'description', fatal=False)

        return [{
            'id': short_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }]
1570
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes: a bare channel name (paged archive listing),
    # /b/<id> for a single broadcast, /c/<id> for a chapter of one.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Page size the Justin.tv API serves per request.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        # Fetch one API page.  Note the returned count is the *raw* response
        # length, not len(info): the caller uses it to decide whether a full
        # page was served and paging should continue.
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # Errors come back as a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Clips without a file URL are skipped (still counted above).
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO-like; keep just YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through its archive listing.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # A chapter is a sub-range of an archived broadcast; find the
            # containing archive and download that whole file.
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: the else only fires if no <archive> matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Chapter title/description/uploader come from the Twitch
            # Kraken API rather than the XML above.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single archived broadcast.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        # Page through the API; a short page (page_count != limit) or the
        # non-paged case terminates the loop.
        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
1703
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = m.group('id')
        page = self._download_webpage(url, video_id)

        # The mp4 source is the second <source> inside the <video> tag.
        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            page, u'video URL', flags=re.DOTALL)

        # Prefer the player headline; fall back to the <title> element.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), page, 'title', flags=re.DOTALL)

        description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            page, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
1732
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com video/app pages."""
    # Verbose regex: matches /video/<gameID> and /app/<gameID> pages,
    # optionally behind an /agecheck/ prefix.
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Age-gated games: re-request through the agecheck URL, which
        # embeds a fixed birth date so the real page is served.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # NOTE(review): the three finditer streams are assumed to yield
        # movies, titles and thumbnails in the same page order — zip pairs
        # them purely positionally; verify against a live page.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            info = {
                'id':video_id,
                'url':video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        # A single playlist entry wrapping every trailer found on the page.
        return [self.playlist_result(videos, gameID, game_title)]
1787
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Recorded videos live at a predictable CDN location.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        page = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            page, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            page, u'thumbnail', fatal=False)

        # NOTE: this extractor returns a bare info dict rather than a
        # single-element list (kept as in the original).
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
1819
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # The flash variable may point at either an mp4 or an flv file.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            mobj = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url' : video_url,
            'title' : video_title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
1859
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Extract an RBMA Radio show from the page's embedded gon.show JSON blob."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Forcing cbr=256 selects the 256 kbit/s stream variant.
        video_url = data['akamai_url'] + '&cbr=256'
        # Extension is taken from the URL path, e.g. ".../show.mp3" -> "mp3".
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
1893
1894
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Bypass the age gate with a pre-set cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Video metadata is passed to a JS Video() constructor as JSON.
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # was a bare `except:`, which also swallowed KeyboardInterrupt etc.
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as e:
            # str(e): concatenating the exception object itself raised TypeError.
            raise ExtractorError('Missing JSON parameter: ' + str(e))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # Path segment 4 looks like '480p_370k_8004515'; its first two
            # '_'-separated parts (size, bitrate) form the format id, e.g. '480p-370k'.
            format = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # BUG FIX: previously checked the undefined name 'result' (NameError).
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
1999
2000
2001
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        # The title is taken straight from the URL slug.
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL.
        # BUG FIX: dots in the host were unescaped ('video[0-9].pornotube.com'),
        # letting '.' match any character; they are now escaped literals.
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9]\.pornotube\.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
2036
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Resolve a youjizz watch page to its flash media URL via the embed page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The watch page only yields the title ...
        webpage = self._download_webpage(url, video_id)
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # ... the media URL lives on a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)
        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
2077
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes; returns one entry per track."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded in the page as a JS assignment: "PAGE.mix = {...};".
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API wants a client-chosen session id; any random number works.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        # The API hands out one track per request; keep hitting the "next"
        # endpoint until it flags the last track of the mix.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            # The next request must reference the track we just received.
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
2118
class KeekIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Extract a Keek clip; media and thumbnail URLs follow fixed CDN patterns."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': title,
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': uploader
        }]
2146
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE whitespace/comments.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on whether the URL points at a single talk or a playlist.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): '([.\s]*?)' matches only literal dots/whitespace --
        # presumably '[\S\s]' (any char) was intended; left as-is to preserve behavior.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Each talk is delegated back to this extractor as a plain talk URL.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                       webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # The last entry of 'htmlStreams' is the one selected for download.
        info = {
            'id': info['id'],
            'url': info['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': desc,
        }
        return info
2221
class MySpassIE(InfoExtractor):
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Extract a myspass.de video via its XML metadata service."""
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: previously assigned the undefined name 'ext' (NameError);
            # fall back to the file extension when no format id is present.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
2275
class SpiegelIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Extract a Spiegel video via its per-video flash XML descriptor."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last variant listed in the descriptor.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
2307
class LiveLeakIE(InfoExtractor):

    # BUG FIX: the scheme was the typo 'http?' (matching 'htt'/'http' but never
    # 'https'), so https links were never recognized; now 'https?'.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract a LiveLeak video from its view page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # og:title is prefixed with the site name; strip it off.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
2344
2345
2346
class TumblrIE(InfoExtractor):
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Extract a video embedded in a Tumblr post."""
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # Pick the first poster frame as the thumbnail; unescape backslashes.
        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
2380
class BandcampIE(InfoExtractor):
    """Information extractor for free Bandcamp tracks (mp3-320 only)."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # The numeric track id lives in the page's TralbumData JS object.
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                       webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some JavaScript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        # (the '.rand' value is a fixed magic number known to be accepted).
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key; with this fixed value it arrives as "retry_url".
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id':id,
                      'title' : info[u'title'],
                      'ext' : 'mp3',
                      'url' : final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
                      }

        return [track_info]
2426
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')
        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        # RedTube serves plain MP4 sources.
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
2454
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Metadata (including the MP4 link) is published as an MRSS feed.
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
2481
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical page for the extracted numeric id.
        webpage = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
2515
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Fetch the canonical page regardless of the URL variant passed in.
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
2549
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')

        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Step 1: the photo page embeds a per-video secret.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        # Step 2: trade id + secret for a playlist node id.
        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        # Step 3: the playlist XML carries the actual stream location.
        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
2598
class TeamcocoIE(InfoExtractor):
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Extract a Team Coco video via its CVP XML data feed."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The slug page exposes the numeric id needed for the data feed.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data = self._download_webpage('http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
2637
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Despite the variable name this is a plain HTML page, not an MRSS feed.
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config carries server + file as JS string literals.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        # An empty 'srv' means 'file' already holds the complete (escaped) URL.
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # Upload date is shown as a 'hint' tooltip: 'YYYY-MM-DD hh:mm:ss TZ'.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Concatenate to the YYYYMMDD form expected downstream.
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
2689
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        # The id from the URL is only provisional; the definitive track id
        # comes from the embedded JSON below.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # 'ax'/'ts' query parameters mimic the site's own AJAX requests.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The session cookie must be replayed on the /serve request below.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # Track metadata is embedded as JSON inside a <script> tag.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # The serve endpoint answers with JSON holding the final stream URL.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
2739
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        # Pull the video id out of the play URL.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page is a stub that redirects via JavaScript; resolve the
        # target manually and fetch the real video page.
        stub_page, urlh = self._download_webpage_handle(url, video_id)
        target = self._search_regex(r'window\.location = \'(.*)\';', stub_page, u'redirect location')
        webpage = self._download_webpage(urlh.geturl() + target, video_id, u'Downloading redirect page')

        # The <title> holds "<name> / <extra>"; keep only the first segment.
        raw_title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title')
        title = raw_title.split('/')[0].strip()

        # Ask the magare.do endpoint for the media URL and thumbnail.
        form_data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request("http://vbox7.com/play/magare.do", form_data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response is "<key>=<url>&<key>=<thumb>"; keep the values only.
        final_url, thumbnail_url = [field.split('=')[1] for field in info_response.split('&')]

        return [{
            'id': video_id,
            'url': final_url,
            'ext': "flv",
            'title': title,
            'thumbnail': thumbnail_url,
        }]
2775
2776
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep the classes in a tuple so the matching priority is explicit,
    # then instantiate them all in that exact order.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [klass() for klass in ie_classes]
2846
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention and live
    # at module level, so a plain globals() lookup resolves them.
    class_name = '%sIE' % ie_name
    return globals()[class_name]