]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
Modified Youtube video/playlist matching; fixes #668; fixes #585
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
import base64
import datetime
import email.utils
import itertools
import math
import netrc
import operator
import os
import random
import re
import socket
import sys
import time
import xml.etree.ElementTree
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Class-level defaults; _ready flips to True after _real_initialize()
    # has run once, _WORKING marks whether the extractor is known-good.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return bool(re.match(cls._VALID_URL, url))

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Idempotent: _real_initialize() runs at most once per instance.
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the display name from the class name by dropping the
        # trailing "IE" suffix (e.g. "MetacafeIE" -> "Metacafe").
        return self.__class__.__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        if errnote is None:
            errnote = u'Unable to download webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as exc:
            # Re-raise as ExtractorError, preserving the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(exc)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        return handle.read().decode('utf-8', 'replace')
131
132
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video',  # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs would also match _VALID_URL, so defer to the
        # playlist extractor for those.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when none given
            start = float(start)
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" % (start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" % (end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption)  # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Return (error_message_or_None, srt_contents_or_None) for video_id."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Pick the requested language, falling back to English, then to
        # whatever track is available.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if srt_lang not in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'lang': srt_lang,
            'name': srt_lang_list[srt_lang].encode('utf-8'),
            'v': video_id,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the available format itags with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and optionally log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX / dsh form tokens from the login page.
        # NOTE(review): if either token is missing these stay None and the
        # encode() below would fail — presumably the page always carries
        # them; confirm before hardening.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
            u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form being served again means authentication failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the 11-character video id matched by _VALID_URL, or None."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # Group 1 is the optional URL prefix; group 2 is the id itself.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try the embedded/detail/vevo variants in turn
        # until one of them yields a 'token'.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                              % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:  # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    # Date not in this format; try the next expression.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # Some streams carry the signature in a separate 'sig' field;
            # only append it when present instead of KeyError-ing on
            # signature-less entries.
            url_map = {}
            for ud in url_data:
                format_url = ud['url'][0]
                if 'sig' in ud:
                    format_url += '&signature=' + ud['sig'][0]
                url_map[ud['itag'][0]] = format_url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])]  # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])]  # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats]  # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id': video_id,
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
            })
        return results
591
592
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter opt-out.

        Both requests exist only for their cookie side effects; the
        response bodies are discarded.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate to the
        # downloader (which will dispatch to YoutubeIE).
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # Decode immediately: on Python 3 the raw bytes could not be
            # matched against the str regexes below.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: the media URL lives inside the flashvars form field.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Already text; the old bytes .decode('utf-8') calls would raise
        # AttributeError on Python 3 str objects.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
718
719
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # The video id is the URL path component up to the first '_' or '?'.
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = m.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter switched off so that
        # age-restricted videos are served as well.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Probe known quality keys best-first and keep the first one present.
        max_quality = None
        for quality_key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Uploader: try the regular owner markup first, then the
        # "official user" markup.
        video_uploader = None
        m = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if m is not None:
            video_uploader = m.group(1)
        else:
            m_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if m_official is not None:
                video_uploader = m_official.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
807
808
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Photobucket only serves flv through this endpoint.
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode immediately: on Python 3 the raw bytes could not be
            # matched against the str regexes below.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Groups are already text; the old bytes .decode('utf-8') calls
        # would raise AttributeError on Python 3 str objects.
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
872
873
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info.

        Non-/watch/ URLs are first rewritten to the canonical English
        /watch/ form and re-extracted once (new_video=False on recursion).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # Bug fix: group 1 only captures the 'people'/'profile' path
        # component of the href; the uploader name itself is group 2.
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed for the playlist request below)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]
1015
1016
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract a Vimeo video's info dict from its watch page."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # play_redirect_hls links carry the id; fetch the canonical page.
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page.
        # Only IndexError (marker not found in the split) and ValueError
        # (json.loads on malformed text) are expected here; a bare except
        # would also swallow KeyboardInterrupt/SystemExit.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available (quality first, then codec preference);
        # the for/else fires only when no quality bucket had an entry.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1135
1136
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its body, or None after reporting an error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url and extract regex groups into a dict.

        matchTuples is a list of (group_index, key, error_message) tuples.
        Returns None (after reporting the problem) when the page cannot be
        fetched or any required group is missing.
        """
        page = self.fetch_webpage(url)
        # fetch_webpage already reported the failure; bail out instead of
        # crashing in re.search(regex, None) below.
        if page is None:
            return
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the rtmp URL of a live stream.

        Returns the rtmp URL string, or None on failure.  NOTE(review): the
        caller (_real_extract) currently ignores this value and builds no
        info dict for live streams, so they are effectively unsupported.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        if info is None:
            return
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + r'.*?)\'.*?' +
                r'(http://.*?\.swf).*?' +
                r'(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        if info is None:
            return
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # Previously this value was computed and silently dropped; return it
        # so callers can make use of it.
        return video_url

    def extractPlus7Stream(self, url):
        """Follow the chain of intermediate documents for an arte+7 page and
        build the final info dictionary, or None on failure."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        if info is None:
            return
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        if info is None:
            return
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                r'<name>(.*?)</name>.*?' +
                r'<dateVideo>(.*?)</dateVideo>.*?' +
                r'<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        if info is None:
            return

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live streams: resolved but not yet downloadable (see
            # extractLiveStream's docstring).
            self.extractLiveStream(url)
            return

        info = self.extractPlus7Stream(url)
        # Guard against returning [None] when extraction failed; the error
        # was already reported inside grep_webpage.
        if info is None:
            return

        return [info]
1271
1272
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Follows URL-shortener redirects, then scrapes the page for an embedded
    media URL (JW Player / SWFObject style flashvars).
    """

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # Bug fix: this message previously said 'unable to extract title'.
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
1420
1421
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles 'ytsearch[N|all]:<terms>' pseudo-URLs by paging through the
    GData API and queueing the matching watch-page URLs for download.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            # Keep the try body minimal: only int() can legitimately raise
            # ValueError here.  Previously the download call itself sat
            # inside this try, so a ValueError raised while fetching or
            # parsing results silently restarted the search with n=1.
            try:
                n = int(prefix)
            except ValueError:  # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the true number of hits; never page past it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1496
1497
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles 'gvsearch[N|all]:<terms>' pseudo-URLs by scraping the search
    result pages and queueing the matching videoplay URLs for download.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            # Keep the try body minimal: only int() should be guarded.
            # Previously _download_n_results sat inside this try, so a stray
            # ValueError during download restarted the search with n=1.
            try:
                n = int(prefix)
            except ValueError:  # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" link means we have exhausted the results.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1579
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles 'yvsearch[N|all]:<terms>' pseudo-URLs by scraping the search
    result pages and queueing the matching watch URLs for download.
    """

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            # Keep the try body minimal: only int() should be guarded.
            # Previously _download_n_results sat inside this try, so a stray
            # ValueError during download restarted the search with n=1.
            try:
                n = int(prefix)
            except ValueError:  # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "Next" link means we have exhausted the results.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1664
1665
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose regex with two alternatives: branch 1 matches playlist-style
    # page URLs (course/view_play_list/artist/playlist/watch/user pages) and
    # captures the playlist id in group 1; branch 2 matches a bare
    # (PL|EC|UU)-prefixed playlist id in group 2.
    _VALID_URL = r"""(?:
                     (?:https?://)?
                     (?:\w+\.)?
                     youtube\.com/
                     (?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                     |  user/.*?/user/
                     |  p/
                     |  user/.*?#[pg]/c/
                     )
                     ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                     .*
                  |
                     ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                  )"""
    # GData playlists feed; %s/%i slots: playlist id, page size, start index.
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is written in verbose form.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Page through the GData playlist feed and queue every video URL."""
        # Extract playlist id (group 1 for page URLs, group 2 for bare ids).
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            # start-index is 1-based, hence the (page_num - 1) * page_size + 1.
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))
                return

            if not 'feed' in response or not 'entry' in response['feed']:
                self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API')
                return
            # Collect (playlist position, watch-page URL) pairs; entries
            # without a 'content' key are skipped.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A page shorter than _MAX_RESULTS entries is the last one.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then keep only the URLs.
        # NOTE: relies on Python 2's map() returning a list; the len() and
        # slicing below would fail on a Python 3 map iterator.
        videos = map(operator.itemgetter(1), sorted(videos))

        total = len(videos)

        # Apply the user's --playlist-start/--playlist-end window
        # (playliststart is 1-based; playlistend of -1 means "to the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
        return
1759
1760
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Make sure the URL really is a channel page and pull out its id.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return
        channel_id = mobj.group(1)

        # Walk the paginated video list until the "Next" link disappears.
        video_ids = []
        pagenum = 1
        while True:
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's video ids, dropping duplicates within the page.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        # Queue each collected video for download.
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1811
1812
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Pull the username out of the URL (or the ytuser: shorthand).
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return
        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries,
        # so request consecutive pages until one comes back short.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Gather this page's video ids, skipping duplicates within the page.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # A page that is not completely full must be the last one;
            # no need to query any further.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break
            pagenum += 1

        all_ids_count = len(video_ids)

        # Honour --playlist-start / --playlist-end.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1895
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the user's numeric id from the profile page, then pages
    through the mobile AJAX episode list and queues every video found.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12  # number of results per AJAX page
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # BUGFIX: this lookup used to run inside the try block above, which
        # only catches network errors; a page without data-users-id raised
        # an uncaught AttributeError on .group(1). Report it cleanly instead.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract blip.tv user id from: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Use compat_str for consistency with the rest of the file.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honour --playlist-start / --playlist-end.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1986
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # BUGFIX: decode the response once here. The raw bytes object
            # broke the str-pattern regexes below under Python 3, and the
            # later .decode('utf-8') calls failed on str results.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
2045
2046
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Matches /video/video.php and /photo.php URLs carrying a numeric v= id.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    # Mobile login endpoint; credentials are POSTed here during initialization.
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Best-effort login using --username/--password or .netrc.

        Silently returns (no error raised) when no credentials are
        available or when login fails; extraction then proceeds
        unauthenticated.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials at all: skip login entirely.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means authentication failed.
            # NOTE(review): login_results is bytes here while the pattern is
            # str — on Python 3 this re.search would raise TypeError; confirm.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the (HD if available, else SD) MP4 URL, title, duration
        and thumbnail for a single Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters sit between these two exact script snippets;
        # both are escaped and used as literal anchors for the search below.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        # The captured snippet is a JSON array of [key, value] pairs;
        # dict() turns it into a mapping. 'params' is URL-encoded JSON.
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD source and fall back to SD.
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        # The page title header doubles as the video title.
        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2142
2143
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Fetch the page's JSON skin; if the server instead answers with the
        video bytes directly (Content-Type video/*), hand the open urlhandle
        to the downloader rather than parsing JSON."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin parameters with the correct separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # The iTunes user agent is also recorded in the returned info dict
        # so the downloader replays it when fetching the media.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): .decode on the split result assumes a bytes
                # str (Python 2); on Python 3 str has no .decode — confirm.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh  # downloader reads the body from here
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh was opened in the try block above and is still readable.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # NOTE(review): '%H:%M%p' mixes a 24-hour hour with AM/PM;
                # kept as-is since it matches blip.tv's datestamp — confirm.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                # Covers both missing JSON keys and the extension error above.
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2233
2234
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Derives the FLV media URL from the thumbnail's image_src link on the
    watch page and uses the HTML <title> as the video title.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was self._download.trouble — a nonexistent attribute
            # that raised AttributeError instead of reporting the bad URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media base URL; appending the id
        # with a .flv suffix yields the downloadable file.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
2283
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, lowest quality last-to-first; turls below is matched
    # against these via the --format option.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolutions per bitrate, used only by --list-formats output.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL requires the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available bitrate/extension/resolution table for --list-formats."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve a show/episode/clip URL to one info dict per video part.

        Steps: normalize shortcuts to the newest full episode, fetch the
        page, locate the mtvnservices media URI, download the MRSS index,
        then for each <item> download its mediaGen config, pick a bitrate,
        and rewrite the RTMP URL to a plain HTTP one.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('shortname'):
            # Shortcut form (:tds / :colbert): rewrite to the show's
            # full-episodes page and re-match.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            # Single-clip URLs carry the title in a show-specific group.
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No explicit episode means "download the newest one": the site
            # redirects the bare full-episodes page to it.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Follow the redirect to the concrete episode URL and re-parse.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        # Each <item> in the MRSS index is one part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The guid looks like ...:<show>.com:<mediaId>; split out both.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs from the mediaGen config.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into the equivalent plain-HTTP URL on
            # the mtvnmobile CDN, keeping the gsp.comedystor path suffix.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2478
2479
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        show_name = mobj.group('showname')
        video_id = mobj.group('episode')

        # Fetch the episode page, honouring the charset advertised in the
        # Content-Type header (falling back to UTF-8).
        self.report_extraction(show_name)
        try:
            response = compat_urllib_request.urlopen(url)
            raw_body = response.read()
            charset_match = re.match(r'text/html; charset="?([^"]+)"?', response.headers['Content-Type'])
            page = raw_body.decode(charset_match.group(1) if charset_match else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Pull metadata out of the page's <meta> tags; the player URL
        # carries the config location in its query string.
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', page).group(1))
        thumbnail_url = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        player_url = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        config_url = compat_urllib_parse.unquote(re.search('config=(.*)$', player_url).group(1))

        self.report_config_download(show_name)
        try:
            config_response = compat_urllib_request.urlopen(config_url)
            charset_match = re.match(r'text/html; charset="?([^"]+)"?', config_response.headers['Content-Type'])
            config_text = config_response.read().decode(charset_match.group(1) if charset_match else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        config_text = config_text.replace("'", '"')

        try:
            config = json.loads(config_text)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The second playlist entry holds the actual media URL.
        playlist = config['playlist']
        video_url = playlist[1]['url']

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'flv',
            'thumbnail': thumbnail_url,
            'description': description,
            'player_url': player_url,
        }

        return [info]
2553
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Announce the download of the XML manifest."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce the start of metadata extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        # First request: per-video metadata XML (title, description, thumbnail,
        # and the URL of the Adobe f4m manifest).
        self.report_extraction(video_id)
        meta_url = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            meta_xml = compat_urllib_request.urlopen(meta_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(meta_xml)
        try:
            video_node = mdoc.findall('./video')[0]
            info['description'] = video_node.findall('./description')[0].text
            info['title'] = video_node.findall('./caption')[0].text
            info['thumbnail'] = video_node.findall('./thumbnail')[0].text
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # Second request: the f4m manifest, which yields the media node id
        # and the real video id used to compose the fragment URL.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifest_xml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifest_xml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        info['url'] = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
        info['ext'] = 'f4f'
        return [info]
2624
2625
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Announce the start of metadata extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Video URL is percent-encoded inside the flv_url page parameter.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title comes from the <title> tag, minus the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail: the whole matched URL (group 0) is used.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2683
2684
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

    The track is resolved via the public Soundcloud API: the page URL is
    first resolved to a numeric track id, then the stream definitions for
    that id are fetched and the 128kbit MP3 stream URL is used.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Announce the id-resolution step."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce the stream-retrieval step."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The uploader and the slug of the song title are both in the URL.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the page URL to the track's JSON metadata.
        page_url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + page_url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            request = compat_urllib_request.Request(resolv_url)
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        track_id = info['id']
        self.report_extraction(full_title)

        # Fetch the stream definitions for the resolved track id.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(track_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            request = compat_urllib_request.Request(streams_url)
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id': info['id'],
            'url': streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
2757
2758
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Announce the start of metadata extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media path is base64-encoded in the jsclassref attribute.
        match = re.search(r"jsclassref='([^']*)'", webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(match.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        match = re.search(r'contentTitle = "(.*?)";', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # Description is optional; fall back to a placeholder.
        match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = match.group(1) if match is not None else u'No description available.'

        # Derive id and extension from the last path component of the URL.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2812
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        If the format carries per-bitrate variants, pick the requested (or
        highest) bitrate; otherwise return the format's URL list directly.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest
            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list (None if none respond)."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass  # try the next mirror

        return None

    def _print_formats(self, formats):
        """List available formats (and bitrates, where known) on stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # BUG FIX: re.match groups are already text strings; the old
        # .decode('utf-8') calls raised AttributeError on Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        # BUG FIX: urlopen().read() returns bytes; decode before json.loads
        # (json.loads only accepts bytes on Python >= 3.6).
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe every format until one of its mirrors responds.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2927
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video page (course + video), a
    course page (playlist of videos), and the site root (playlist of
    courses). Playlist pages are expanded recursively via self.extract().
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video XML sits next to the media files under the course dir.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            # Extension is whatever follows the last dot of the media URL.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title falls back to the course id if no <h1> is found.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Each VideoPage link becomes a reference entry, extracted in turn.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Each CoursePage link becomes a reference entry, extracted in turn.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3039
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # BUG FIX: webpage is already a text string; the old
        # .decode('iso-8859-1') calls raised AttributeError on Python 3.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # BUG FIX: error message previously read 'unable to mtvn_uri'
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing available renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3119
3120
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku splits a video into segments; each segment URL embeds a
    descrambled file id and a per-segment key taken from the playlist JSON.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        # Session id: current time in milliseconds plus two random components.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the alphabet using the server-provided
        # seed (a linear-congruential-style scramble); the result is the
        # lookup table used by _get_file_id.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # fileId is a '*'-separated list of indices into the seeded
        # mix string; map each index back to its character.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Playlist JSON carries title, seed, formats, and per-segment keys.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested format to a Youku stream name and extension:
            # 'best' prefers hd2 when available, 'worst' takes mp4,
            # anything else falls back to flv.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3230
3231
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Announce the webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce the start of metadata extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # Video URL (percent-encoded in the flv_url parameter).
        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3294
3295
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's upload date."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction from the video page."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title: get the first line of the description for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # BUG FIX: previously fell through and crashed on mobj.group(1)
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # BUG FIX: previously fell through and crashed on links[-1]
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
3419
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The MP4 lives on Turner's CDN under the same path as the page.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Return the unescaped first group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUG FIX: key was misspelled 'uploader_date'; the documented
            # optional field name is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3455
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Fetch one API page and return (item_count, [info dicts])."""
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            # BUG FIX: returning None made the caller's tuple unpacking
            # raise TypeError when errors are ignored; report zero items.
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO-ish; keep YYYYMMDD only.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        # Only group 1 matched => a whole channel, which must be paged
        # through; group 2 => a single broadcast.
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3542
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
            # BUG FIX: missing return; with ignored errors the next line
            # raised AttributeError on the failed match.
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            # BUG FIX: same missing early return as above.
            return
        title = unescapeHTML(m.group('title'))

        # Description is optional.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3579
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com clips."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            # BUG FIX: missing return; with ignored errors the code fell
            # through and crashed on the failed match.
            return
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'WARNING: Cannot find description')
            # BUG FIX: the warning path still dereferenced the failed
            # match; a missing description is non-fatal.
            desc = None
        else:
            desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            return
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
3628
class SteamIE(InfoExtractor):
    """Information extractor for trailers on Steam store pages."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        game_id = m.group('gameID')
        page_url = 'http://store.steampowered.com/video/%s/' % game_id
        webpage = self._download_webpage(page_url, game_id)

        # Movies, their display titles and thumbnails appear in the same
        # order on the page, so the three scans are zipped together.
        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        title_re = r'<span class="title">(?P<videoName>.+?)</span>'
        thumb_re = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'

        results = []
        for movie, name, thumb in zip(re.finditer(movie_re, webpage),
                                      re.finditer(title_re, webpage),
                                      re.finditer(thumb_re, webpage)):
            vid = movie.group('videoID')
            flv_url = movie.group('videoURL')
            if not flv_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % vid)
            results.append({
                'id': vid,
                'url': flv_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail')
            })
        return results
3669
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv broadcasts."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # Title and uploader (channel id) are embedded as data attributes.
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')

        return [{
            'id': video_id,
            'url': u'http://tcdn.ustream.tv/video/%s' % video_id,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }]
3691
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows (JSON metadata in page)."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        # Extension comes from the URL path, e.g. ".../show.mp3?..." -> "mp3".
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3726
3727
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site requires an age_verified cookie to serve the real page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (non-fatal)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (non-fatal)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 encodes "<size>_<bitrate>_<id>"; keep size/bitrate.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: this previously tested the stale 'result' variable
            # (last regex match), so a missing requested format was never
            # reported and [None] was returned.
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3844
3845
3846
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: message previously said "unable to extract video
            # title", which misreported the actual failure.
            self._downloader.trouble(u'ERROR: unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3888
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        webpage = self._download_webpage(url, video_id)

        # The display title sits in the page's <title> element.
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # The watch page only links the embed page; the real stream URL is there.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3934
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # The play API hands out one track at a time per session.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        for idx in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(idx+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = ('http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s'
                        % (session, mix_id, track_data['id']))
        return entries
3978
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        uploader_match = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)

        # Video and thumbnail URLs are fully determined by the clip id.
        return [{
            'id': video_id,
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': unescapeHTML(title_match.group('title')),
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': unescapeHTML(uploader_match.group('uploader'))
        }]
4002
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            # Single talk: one info dict.
            return [self._talk_info(url)]
        playlist_id = m.group('playlist_id')
        name = m.group('name')
        self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
        return self._playlist_videos_info(url, name, playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')

        # Talk entries and their title links appear in the same page order.
        entries = []
        for m_video, m_name in zip(re.finditer(video_RE, webpage, re.VERBOSE),
                                   re.finditer(video_name_RE, webpage)):
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            entries.append(self._talk_info(talk_url, m_video.group('video_id')))
        return entries

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        talk_name = re.match(self._VALID_URL, url, re.VERBOSE).group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % talk_name)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
        title = re.search(title_RE, webpage).group('title')
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match = re.search(thumb_RE, webpage)
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        video_url = self._talk_video_link(info_match.group('mediaSlug'))
        return {
            'id': info_match.group('videoID'),
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail')
        }
4075
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (video metadata via XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch referenced the undefined name 'ext'
            # (NameError at runtime); fall back to the file extension.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4131
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # One instance of each class, preserving match-priority order.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        GenericIE,
    )
    return [ie_class() for ie_class in ie_classes]
4183
4184