]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
InfoExtractors: use _download_webpage in more IEs
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Whether _real_initialize() has already run for this instance.
    _ready = False
    # The FileDownloader this IE reports to; set via set_downloader().
    _downloader = None
    # Subclasses set this to False when site support is known to be broken.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Relies on the subclass defining _VALID_URL; the base class has none.
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Idempotent: _real_initialize() runs at most once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default name: the class name minus the trailing "IE" suffix.
        # Subclasses commonly shadow this with a plain class attribute.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle.

        note=None prints the default "Downloading webpage" status line;
        note=False suppresses the status line entirely; any other value is
        printed verbatim. Network failures are re-raised as ExtractorError.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # NOTE(review): `sys` is not among this file's visible imports;
            # presumably it is pulled in by `from .utils import *` — confirm.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string.

        The body is decoded with the charset from the Content-Type header,
        falling back to UTF-8, with undecodable bytes replaced.
        """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Pull the charset out of e.g. "text/html; charset=ISO-8859-1".
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            # url_or_request may be a Request object or a plain URL string.
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            # Base64 so binary/odd bytes survive the terminal.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        return webpage_bytes.decode(encoding, 'replace')

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608
    # They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
185
186
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose-mode pattern: matches full watch/embed/shortlink URLs and also
    # bare video IDs (the whole URL prefix is optional).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything missing defaults to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions display string used by --list-formats.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; defer to the playlist IE first.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to list the available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download one subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return a {lang_code: track_name} dict of caption tracks.

        On failure returns a (error_message, None) tuple instead of a dict;
        callers distinguish the two cases with isinstance(..., tuple).
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the available subtitle languages for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple):  # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit option > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        # Track name is encoded to bytes before urlencode (Python 2 idiom).
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Return a list of (error_message, sub_lang, sub) tuples, one per language."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple):  # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print one "itag : ext [dimensions]" line per available format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the interface language and optionally log in / confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # The login form embeds two anti-forgery tokens we must echo back.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
            u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is served again, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID part of url, or report an error and return None."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # group(1) is the whole (optional) URL prefix; group(2) is the ID.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Return a list of info dicts, one per selected format."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JS-escaped URL (\\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try the el= variants in turn until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                              % (video_id, el_type))
            # note=False: suppress the per-attempt status line.
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                        note=False,
                                                        errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces before parsing.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            # Fall back to the meta description tag.
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            # Listing mode: print languages and stop, no download.
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # The stream map is a comma-separated list of querystring-encoded
            # per-format dicts (itag, url, sig, ...).
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): only 'itag' and 'url' are checked above; an entry
            # without a 'sig' field would raise KeyError here — confirm all
            # stream-map entries carry 'sig'.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
676
677
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter form to
        disable filtering for this session."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Return a one-element list with the video info dict, or delegate
        yt-prefixed IDs to the YouTube extractor."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Newer pages embed the media data inside the flashvars param.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # FIX: _download_webpage() returns an already-decoded unicode string,
        # so everything extracted from it is text, not bytes. The previous
        # .decode('utf-8') calls on these values raised AttributeError on
        # Python 3 (str has no decode) and UnicodeDecodeError for non-ASCII
        # titles on Python 2; they have been removed.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title':    video_title,
            'ext':      video_extension,
        }]
781
782
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Return a one-element list with the extracted video info dict."""
        # The video id is the path component before any '_' slug suffix or
        # '?' query string.
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = url_match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so that
        # age-restricted videos are served too.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)

        # The media URLs live in the URL-encoded flashvars assignment.
        flashvars_match = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(flashvars_match.group(1))

        # Pick the first (i.e. best) quality key that the page offers.
        max_quality = None
        for quality_key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if quality_key in flashvars:
                max_quality = quality_key
                self.to_screen(u'Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        media_match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if media_match is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(media_match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(title_match.group('title'))

        # Try the regular owner markup first, then the "official user" span.
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
            else:
                video_uploader = official_match.group(1)

        # The page shows the upload date as DD-MM-YYYY; emit YYYYMMDD.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
862
863
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a photobucket page.

        Returns a one-element list with the info dict, or None after
        reporting an error.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information.
        # _download_webpage reports download progress/errors and returns a
        # unicode page, so the manual urlopen + per-field .decode('utf-8')
        # dance used previously is no longer needed (and was broken on
        # Python 3, where read() returns bytes).
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
916
917
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def _real_extract(self, url, new_video=True):
        """Extract video information.

        Non-/watch/ URLs are first rewritten to the canonical /watch/ form
        and re-extracted via a single recursive call (new_video=False).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # BUGFIX: group(1) only captures the 'people|profile' path component;
        # the uploader's display name is captured by group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (the playlist endpoint below
        # requires both as query parameters)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]
1048
1049
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract video info from the embedded player config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the ' = {config:' marker is missing from the page;
            # ValueError: the extracted blob is not valid JSON.  The previous
            # bare 'except:' also swallowed KeyboardInterrupt/SystemExit.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1154
1155
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw page, or None after reporting."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # urlopen raises ValueError for syntactically invalid URLs
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and map the numbered groups listed in
        *matchTuples* (group-index, dict-key, error-message) into a dict.

        Returns None (after reporting the per-group error message) when the
        regex does not match or a required group is empty.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream player chain for *url*.

        NOTE(review): video_url is computed at the end but never returned,
        and _real_extract discards this method's result entirely — the
        live-stream path appears incomplete; confirm intended behavior.
        """
        # The language code is a path component, e.g. .../fr/...
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url', u'could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): dead store — see docstring above.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the Arte+7 chain (player page -> videoref XML -> video XML)
        and return the info dict for the hd-quality stream."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date', u'could not extract video date: %s' % url),
                (4, 'url', u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        """Dispatch between the live-stream and Plus7 extraction paths."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # NOTE(review): the live branch returns None (no info dict), so live
        # URLs currently yield nothing downloadable — confirm intent.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1279
1280
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn once that we are falling back to guessing (silenced under
        # --test to keep test output clean).
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Use HEAD so only headers are transferred.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Last-resort extraction: follow redirects, then scrape the page for
        a direct media URL (JW Player / SWFObject style embeds)."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
            if mobj is None:
                # Broaden the search a little bit: JWPlayer JS loader
                mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
                if mobj is None:
                    self._downloader.report_error(u'Invalid URL: %s' % url)
                    return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # BUGFIX: this failure path used to report 'unable to extract
            # title' (copy-paste error) even though it is the uploader
            # (domain name) that could not be determined.
            self._downloader.report_error(u'unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
1417
1418
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch[N|all]: prefix and return the matching results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # BUGFIX: this branch was missing its 'return', so 'ytsearchall:'
            # queries silently discarded their results.
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API may report fewer total items than we asked for;
            # shrink the limit so the loop terminates.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
1489
1490
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch[N|all]: prefix and download the matching videos."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        # An empty prefix means one result; 'all' means the hard maximum.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return

        try:
            count = int(prefix)
            if count <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (count, query))
                return
            if count > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, count))
                count = self._max_google_results
            self._download_n_results(query, count)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        collected_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect new video identifiers in first-seen order
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate in collected_ids:
                    continue
                collected_ids.append(candidate)
                if len(collected_ids) == n:
                    # Specified n videos reached
                    self._enqueue_downloads(collected_ids)
                    return

            # No "next page" link: hand over whatever we found
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._enqueue_downloads(collected_ids)
                return

            pagenum = pagenum + 1

    def _enqueue_downloads(self, video_ids):
        """Hand every collected docid to the downloader."""
        for video_id in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])
1568
1569
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch[N|all]: prefix and download the matching videos."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        # An empty prefix means one result; 'all' means the hard maximum.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return

        try:
            count = int(prefix)
            if count <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (count, query))
                return
            if count > self._max_yahoo_results:
                self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, count))
                count = self._max_yahoo_results
            self._download_n_results(query, count)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        collected = []
        seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect new video identifiers in first-seen order
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate in seen:
                    continue
                collected.append(candidate)
                seen.add(candidate)
                if len(collected) == n:
                    # Specified n videos reached
                    self._enqueue_downloads(collected)
                    return

            # No "next page" link: hand over whatever we found
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._enqueue_downloads(collected)
                return

            pagenum = pagenum + 1

    def _enqueue_downloads(self, video_ids):
        """Hand every collected watch id to the downloader."""
        for video_id in video_ids:
            self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])
1651
1652
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                     (?:https?://)?
                     (?:\w+\.)?
                     youtube\.com/
                     (?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                     |  p/
                     )
                     ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                     .*
                    |
                     ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                    )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Page through the gdata API and return the playlist's videos."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        playlist_id = mobj.group(1) or mobj.group(2)

        # Accumulate (position, video_url) pairs so the playlist order can
        # be restored after all pages are fetched.
        entries = []
        page_num = 1

        while True:
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(api_url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in response['feed']['entry']:
                if 'content' in entry:
                    entries.append((entry['yt$position']['$t'], entry['content']['src']))

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then keep only the video URLs
        ordered_urls = [video_url for (_position, video_url) in sorted(entries)]

        url_results = [self.url_result(video_url, 'Youtube') for video_url in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1721
1722
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the watch ids found in *page*, first-seen order, no dupes."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = match.group(1)
            if candidate not in found:
                found.append(candidate)
        return found

    def _real_extract(self, url):
        """Collect every video id of a channel and return it as a playlist."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        pagenum = 1

        # The first page is plain HTML
        page = self._download_webpage(self._TEMPLATE_URL % (channel_id, pagenum),
                                      channel_id, u'Downloading page #%s' % pagenum)
        video_ids = self.extract_videos_from_page(page)

        # Any further pages come from the json-based channel_ajax endpoint
        more_pages = self._MORE_PAGES_INDICATOR in page
        while more_pages:
            pagenum = pagenum + 1
            ajax_url = self._MORE_PAGES_URL % (pagenum, channel_id)
            page = self._download_webpage(ajax_url, channel_id,
                                          u'Downloading page #%s' % pagenum)
            page = json.loads(page)
            video_ids.extend(self.extract_videos_from_page(page['content_html']))
            more_pages = self._MORE_PAGES_INDICATOR in page['load_more_widget_html']

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        watch_urls = ['http://www.youtube.com/watch?v=%s' % vid for vid in video_ids]
        url_entries = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_entries, channel_id)]
1781
1782
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Turn a user page into a playlist of all of the user's uploads.

        The GData API caps each response at _GDATA_PAGE_SIZE entries, so
        consecutive windows are requested until one comes back short.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        video_ids = []
        pagenum = 0

        while True:
            # start-index is 1-based in the GData API.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Collect the ids of this page, dropping duplicates within it.
            ids_in_page = []
            seen = set()
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = match.group(1)
                if video_id not in seen:
                    seen.add(video_id)
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A partially filled page must be the last one, so there is no
            # point in issuing another request.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1840
1841
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # The mobile episode-list endpoint returns at most this many items per page.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """List every video of a blip.tv user as a playlist.

        Reads the numeric users_id off the user's page, then walks the
        mobile show_get_full_episode_list endpoint page by page.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        # NOTE(review): if the data-users-id attribute is missing, mobj is
        # None and the next line raises AttributeError rather than producing
        # a clean extractor error.
        page_base = page_base % mobj.group(1)


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1901
1902
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Extract the direct download URL and title for a depositfiles file.

        Posts the 'Free download' form and scrapes the resulting page for
        the fileshare URL; on failure it tries to surface the site's own
        restriction message instead of a generic error.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Decode here: .read() returns bytes on Python 3 and the str
            # regexes below would otherwise raise TypeError.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1)

        # All of these are already text strings; the old .decode('utf-8')
        # calls on them broke on Python 3 (str has no decode method).
        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
1953
1954
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Optionally log in before extraction.

        Credentials come from the downloader's username/password params or,
        with usenetrc, from the 'facebook' machine entry in ~/.netrc.
        Without credentials this is a no-op; login failures only warn.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
        }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # NOTE(review): login_results is bytes on Python 3 while the
            # pattern below is str, so re.search would raise TypeError
            # there — presumably only exercised on Python 2; confirm.
            # A login form still being present means the login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the direct video URL and metadata from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded as JSON between these two
        # JavaScript snippets in the page source.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2052
2053
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pulls the file extension out of the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video info from a blip.tv page.

        /play/ URLs are first resolved to their canonical page via the HTTP
        redirect, then the method recurses. Other URLs are queried through
        the site's JSON API (skin=json) — unless the server answers with
        the video bytes directly, in which case the open response handle is
        passed through as 'urlhandle'.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # Follow the redirect; the file id lives in the URL fragment.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different content depending on the client.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): title is already a text string here, so
                # .decode('UTF-8') only works on Python 2 and raises
                # AttributeError on Python 3 — confirm the intended target.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh  # internal: download reuses this open response
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh is guaranteed to exist here: any failure opening it
                # raised ExtractorError above.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2150
2151
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self,url):
        """Extract the flv URL and title from a myvideo.de watch page.

        Returns a single-entry list of info dicts, or None after reporting
        an error (matching the other extractors in this file).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: this used to call self._download.report_error, but no
            # attribute named _download exists — invalid URLs raised
            # AttributeError instead of being reported.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link carries the movie's base URL; the flv lives
        # alongside it under the video id.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                         webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
2193
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                     (?P<showname>thedailyshow|colbertnation)\.com/
                     (full-episodes/(?P<episode>.*)|
                      (?P<clip>
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate -> container extension
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate -> frame dimensions, for --list-formats style output
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose-mode regex.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print each available bitrate with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve a show/clip URL via the mrss index feed and the per-media
        configuration XML, returning one info dict per episode part."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        if mobj.group('shortname'):
            # :tds / :colbert style shortcuts map to the newest full episode.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No explicit episode in the URL means "download the newest".
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage = self._download_webpage(url, epTitle)
        if dlNewest:
            # NOTE(review): htmlHandle is not defined anywhere in this
            # method — this branch raises NameError. It looks like a
            # leftover from a version that kept the urlopen handle in order
            # to read the redirected URL; _download_webpage returns only the
            # page text. Needs a proper fix against the base-class API.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # Each <item> is one part of the episode.
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                         compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # (bitrate, rtmp URL) pairs from the config's renditions.
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp URL into the equivalent http location.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2364
2365
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract the mp4 URL via the player's config=... JSON file."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        # NOTE(review): each re.search below assumes a match; a page layout
        # change would surface as AttributeError on .group(1) rather than a
        # clean extractor error.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config file location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        # The second playlist entry holds the actual video.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2421
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Flagged as not working; presumably skipped by default — confirm how
    # _WORKING is honoured in the InfoExtractor base class.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Resolve a video through the moogaloop metadata XML and the f4m
        manifest into a single f4f fragment URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Filled in incrementally from the metadata and manifest below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # hdcore query parameter appended as-is; looks like it is required
        # by the manifest server — confirm before changing.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            # Note: video_id is deliberately overwritten with the manifest id.
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Build the URL of the first segment/fragment from the manifest data.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2488
2489
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail from a video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the page source)
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Extract title
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # Extract video thumbnail; the whole matched URL is the thumbnail.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2543
2544
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

    The track page is first resolved through api.soundcloud.com, and the
    mp3 location is then read from the track's stream definitions.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Extract a single track's mp3 URL and metadata."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both the uploader name and the song slug live in the URL itself.
        uploader, slug_title = match.group(1), match.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
2602
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a set URL and return one info dict per track in the set."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        # The resolver reports problems as an 'errors' array.
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            # One extra request per track for its stream definitions.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id': video_id,
                'url': mediaURL,
                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
2666
2667
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page embeds a base64-encoded rtmp path
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; keep a placeholder when absent)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # Bugfix: split only on the LAST dot; filenames containing extra dots
        # made the two-name unpacking raise ValueError.
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2717
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        When the format entry has no bitrate sub-dictionary the entry itself
        is the url list; otherwise pick the requested (or highest) bitrate.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """Print every format/bitrate combination with its file extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # Bugfix: the regex groups are already text strings; the former
        # .decode('utf-8') calls raised AttributeError on Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Try each format until one of its urls responds.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2825
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches three URL shapes: the site root, a course page, and a single
    # video page (course + video query parameters).
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        # Dispatch on which named groups matched: video page, course page,
        # or root page.  The last two recurse through self.extract() over
        # 'reference' entries collected from the HTML.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                # Per-video XML descriptor holds the title and the video file name.
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                # findall() returned no elements: descriptor is malformed.
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            # Derive the extension from whatever follows the last dot of the url.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title falls back to the course id when the <h1> is missing.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect each video page link (deduplicated, order preserved).
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse: each video page goes back through this extractor.
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect each course page link and recurse into every course.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2929
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # Song name and performer come from <meta> tags.  The webpage is
        # already decoded text; the former .decode('iso-8859-1') calls
        # raised AttributeError on Python 3 and were dropped.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Bugfix: message previously read 'unable to mtvn_uri'.
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        # The mediaGen service returns an XML list of renditions for the clip.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.report_error('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3005
3006
class YoukuIE(InfoExtractor):
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a session id out of the current time and two random numbers."""
        now_ms = int(time.time() * 1000)
        rand_a = random.randint(1000, 1998)
        rand_b = random.randint(1000, 9999)
        return "%d%d%d" % (now_ms, rand_a, rand_b)

    def _get_file_ID_mix_string(self, seed):
        """Shuffle the fixed alphabet with Youku's seeded generator."""
        pool = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        shuffled = []
        while pool:
            seed = (seed * 211 + 30031) % 65536
            pos = int(math.floor(seed / 65536 * len(pool)))
            shuffled.append(pool.pop(pos))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Translate the '*'-separated index list into the real file id."""
        alphabet = self._get_file_ID_mix_string(seed)
        return ''.join(alphabet[int(ch)] for ch in fileId.split('*') if ch)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)
            entry = config['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            requested = self._downloader.params.get('format', None)
            available = list(entry['streamfileids'].keys())

            # Map the requested quality onto Youku's internal format names.
            if requested is None or requested == 'best':
                format = 'hd2' if 'hd2' in available else 'flv'
                ext = u'flv'
            elif requested == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][format]
            keys = [s['k'] for s in entry['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8-9 of the file id encode the segment number, so each
        # segment gets its own id with that field substituted.
        files_info = []
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3101
3102
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        # All metadata is embedded in the player parameters of the page.
        webpage = self._download_webpage(url, video_id)

        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3150
3151
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            # Bugfix: previously fell through and crashed on mobj.group(1).
            return

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            # Bugfix: previously fell through and crashed on links[-1].
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
3262
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Return the unescaped first group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Bugfix: key was misspelled 'uploader_date'; the documented
            # optional field is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3298
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # The API returned an error object instead of a clip list.
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            # Bugfix: returning None made the caller's tuple unpacking raise
            # TypeError; an empty page stops the pagination loop cleanly.
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # A channel URL (only group 1 matched) is paged; a single broadcast
        # is fetched in one request.
        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means we have reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3377
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
            # Bugfix: previously fell through and crashed on m.group below.
            return
        video_url = unescapeHTML(m.group('url'))

        # Prefer the player heading; fall back to the page <title>.
        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                self._downloader.report_error(u'Cannot find video title')
                # Bugfix: previously fell through and crashed on m.group below.
                return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3416
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailer pages."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so suitable() is overridden
        # to match with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Pass the age gate with a fixed birth date so the trailer list is visible.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        # Movies, their display names and their thumbnails appear in the same
        # order on the page, so the three iterators are zipped together.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                # Bugfix: skip this entry instead of appending one with an empty url.
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
3461
class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Extract a recorded Ustream video; the CDN url derives from the id."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # Title and uploader id are exposed as data attributes in the markup.
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader_match = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage)
        uploader = uploader_match.group('uploader')

        return [{
            'id': video_id,
            'url': u'http://tcdn.ustream.tv/video/%s' % video_id,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3483
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        # The media file is served from one of two known CDN hosts.
        _src_url = r"""(http://(hw-videos|hw-post1).*(?:mp4|flv))"""

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        mobj = re.search(_src_url, webpage_src)

        if mobj is not None:
            video_url = mobj.group()
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # Bugfix: default title was misspelled 'World Start Hip Hop'.
            title = 'World Star Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
        return results
3538
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Extract a show via the JSON blob embedded in the page script."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(match.group(1))
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON: ' + str(err))

        # The stream url is an Akamai link; force the 256 kbit/s variant.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3573
3574
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if x['format'] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check; this cookie bypasses it.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title (mandatory)
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional, only warn when missing)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (optional, only warn when missing)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html(video_uploader)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if len(links) == 0:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The 5th path component encodes "<size>_<bitrate>_<id>"; keep the
            # first two parts as the format label (e.g. "480p-370k").
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join(format)
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # BUG FIX: this previously tested the stale 'result' regex match
            # instead of the format lookup, so an unavailable requested format
            # was never reported as an error.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3691
3692
3693
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract the flv stream url and the upload date from a watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both the id and the title are encoded in the url itself.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: the failure here concerns the upload date, not the
            # title; the previous message wrongly said "video title".
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3735
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Resolve a youjizz watch page to its flv stream via the embed page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        webpage = self._download_webpage(url, video_id)

        # The page <title> doubles as the video title.
        title_m = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_m is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_m.group('title').strip()

        # The actual player lives on a separate embed page; its url also
        # carries the numeric video id.
        embed_m = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_m is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_m.group(0).strip()
        video_id = embed_m.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flv source is handed to the flash player via addVariable().
        source_m = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_m is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_m.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3781
class EightTracksIE(InfoExtractor):
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the 8tracks play/next API and return every track of a mix."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix description is embedded as JSON assigned to PAGE.mix.
        mix_m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_m:
            raise ExtractorError(u'Cannot find trax information')
        mix = json.loads(mix_m.group(1))

        # A random session id is all the play API needs.
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        track_count = mix['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track = api_data[u'set']['track']
            entries.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            # The API flags the last track; otherwise ask for the next one.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
        return entries
3825
class KeekIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Build the CDN media urls from the id and scrape title/uploader."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media files live at predictable CDN locations derived from the id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        title_m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        uploader_m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': unescapeHTML(title_m.group('title')),
            'thumbnail': thumbnail,
            'uploader': clean_html(uploader_m.group('uploader'))
        }]
3849
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose pattern and must be
        # matched with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on the url type captured by the pattern: a single talk
        # or a playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Each talk entry in the playlist page carries its video id and media
        # slug in data-* attributes.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        # The matching talk-title anchor provides the relative talk url.
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Delegate each talk url back to this extractor via url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # talkDetails is an inline javascript object carrying the numeric
        # video id and the media slug used to build the download url.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
3927
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de, driven by its metadata XML API."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: fall back to the file extension here; 'ext' was an
            # undefined name and raised NameError whenever format_id was
            # missing from the metadata.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3983
class SpiegelIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Scrape the title from the page and the stream from the XML feed."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if title_m is None:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_m.group(1))

        # A per-video XML document lists the available renditions.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')
        idoc = xml.etree.ElementTree.fromstring(xml_code)

        # Pick the last entry of the document.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4016
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract the mp4 stream, title, description and uploader."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            self._downloader.report_error(u'Cannot find video title')
            # BUG FIX: previously execution fell through after reporting the
            # error and crashed with AttributeError on m.group() below.
            return
        # The og:title carries a site prefix that we strip off.
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4065
class ARDIE(InfoExtractor):
    """Information extractor for ardmediathek.de / mediathek.daserste.de."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    # Each stream registration call carries media type, quality and either an
    # rtmp url or a direct http url.
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # Prefer the numeric documentId query parameter when present.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # No streams presumably means an age-restricted ("fsk") video
            # that is only served in the evening — TODO confirm.
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            # rtmpdump needs the play path separately from the server url.
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4105
class TumblrIE(InfoExtractor):
    """Information extractor for videos embedded in tumblr posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post url.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The video source and its extension are hex-escaped (\x22 quotes)
        # inside the embedded player markup.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            # BUG FIX: "founded" -> "found" in the user-facing message.
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
        m_thumb = re.search(re_thumb, webpage)
        # BUG FIX: tolerate a missing poster list instead of crashing with
        # AttributeError; the thumbnail field is optional.
        if m_thumb is not None:
            thumb = m_thumb.group('thumb').replace('\\', '')
        else:
            thumb = None

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
4139
class BandcampIE(InfoExtractor):
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Follow the free-download flow to obtain a direct mp3-320 link."""
        title = re.match(self._VALID_URL, url).group('title')
        webpage = self._download_webpage(url, title)

        # Only tracks that expose a free download page can be fetched.
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            self._downloader.report_error('No free songs founded')
            return
        download_link = m_download.group(1)

        # The numeric track id lives in the TralbumData javascript object.
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # The track dictionary is embedded in some javascript code.
        items_json = re.search(r'items: (.*?),$',
                               download_webpage, re.MULTILINE).group(1)
        info = json.loads(items_json)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # The url found in the page has already expired, so rebuild the
        # request url the way Bandcamp's download_bunde_*.js script does.
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (
            m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key; settle for "retry_url" instead.
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        return [{'id': track_id,
                 'title': info[u'title'],
                 'ext': 'mp3',
                 'url': final_url,
                 'thumbnail': info[u'thumb_url'],
                 'uploader': info[u'artist']
                 }]
4185
4186
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    return [
        # The more specific Youtube variants come before YoutubeIE so that
        # playlist/channel/user/search urls are not claimed by the plain IE.
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVUserIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudSetIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        WorldStarHipHopIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        SpiegelIE(),
        LiveLeakIE(),
        ARDIE(),
        TumblrIE(),
        BandcampIE(),
        # GenericIE acts as the catch-all and must stay last.
        GenericIE()
    ]
4244
def get_info_extractor(ie_name):
    """Return the info extractor class named ``<ie_name>IE``.

    ``ie_name`` is the bare extractor name (e.g. ``'Youtube'`` for
    ``YoutubeIE``); the class is looked up in this module's namespace.
    """
    return globals()['%sIE' % ie_name]