# NOTE: scrape artifact from a git web viewer (jfr.im git - yt-dlp.git), kept as comments:
# original path: youtube_dl/InfoExtractors.py
# commit message: "Added new option '--all-srt' to download all the subtitles of a video."
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18
19 from .utils import *
20
21
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor takes a URL and produces a list of info
    dictionaries describing the video(s) behind it, which the
    FileDownloader then acts upon (possibly downloading them).

    Mandatory keys in each dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional keys:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Concrete extractors override _real_initialize() and _real_extract()
    and define a _VALID_URL regexp; they should also be registered in the
    extractor list.  _real_extract() must return a *list* of dictionaries
    of the shape above.  Broken extractors should set _WORKING = False so
    users are warned and tests are skipped.
    """

    _ready = False          # True once _real_initialize() has run
    _downloader = None      # FileDownloader instance (set via set_downloader)
    _WORKING = True         # set to False in subclasses that are known broken

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        match = re.match(self._VALID_URL, url)
        return match is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc). Idempotent."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derived from the class name by dropping the trailing "IE".
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        return handle.read().decode('utf-8', 'replace')
128
129
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex: group 1 captures the whole (optional) URL prefix,
    # group 2 the video ID -- _extract_id() reads group(2).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality (itag codes, best first)
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same itags, but free (WebM) formats ranked above their non-free peers
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything not listed falls back to 'flv'
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" display string (used for --list-formats)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download the list of video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, lang):
        """Report attempt to download video subtitles for one language."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for lang: %s' % (video_id,lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Fetch the list of closed-caption tracks for video_id.

        Returns a dict mapping lang_code -> track name on success, or a
        2-tuple (warning message, None) on failure.
        NOTE(review): callers (_extract_subtitle, _extract_all_subtitles)
        index the result dict-style without checking for the tuple case --
        a warning return would raise here; confirm upstream handling.
        """
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # The listing XML carries name="..." and lang_code="..." attributes.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        return srt_lang_list

    def _request_subtitle(self, str_lang, str_name, video_id, format = 'srt'):
        """Download one subtitle track.

        Returns (error, None) on failure, or (None, lang, srt_contents)
        on success -- note the differing tuple lengths.
        """
        self.report_video_subtitles_request(video_id, str_lang)
        params = compat_urllib_parse.urlencode({
            'lang': str_lang,
            'name': str_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            srt = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, str_lang, srt)

    def _extract_subtitle(self, video_id):
        """Return a one-element list with the subtitle tuple for the
        requested language (--sub-lang), 'en', or the first available."""
        self.report_video_subtitles_download(video_id)
        srt_lang_list = self._get_available_subtitles(video_id)

        # Language preference: explicit option > English > first listed.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language "%s"' % srt_lang, None)

        sub = self._request_subtitle(srt_lang, srt_lang_list[srt_lang].encode('utf-8'), video_id)
        return [sub]

    def _extract_all_subtitles(self, video_id):
        """Return a list of subtitle tuples, one per available language
        (used by --all-srt)."""
        self.report_video_subtitles_download(video_id)
        srt_lang_list = self._get_available_subtitles(video_id)
        subs = []
        for srt_lang in srt_lang_list:
            sub = self._request_subtitle(srt_lang, srt_lang_list[srt_lang].encode('utf-8'), video_id)
            subs.append(sub)
        return subs

    def _print_formats(self, formats):
        """Print "itag : ext [dimensions]" for each format (--list-formats)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the interface language and optionally log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the GALX and dsh hidden form tokens from the login page;
        # both are required by the Google sign-in POST below.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present in the response, the
            # credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url': '/',
                'action_confirm': 'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID from url, or report an error and return None."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # Group 2 of _VALID_URL is the video ID (group 1 is the URL prefix).
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try each 'el' variant until one response carries
        # a 'token' parameter.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then try several date formats.
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): bare except swallows everything, and the
                    # loop keeps reparsing after a success -- later attempts
                    # then fail on the already-normalized value and are
                    # ignored; works, but fragile.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions: list of (error, lang, srt) tuples, or
        # (error, None) on failure (see _request_subtitle).
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (srt_error, srt_lang, srt) = video_subtitles[0]
                if srt_error:
                    self._downloader.trouble(srt_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (srt_error, srt_lang, srt) = video_subtitle
                if srt_error:
                    self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): only 'itag' and 'url' are checked above, but
            # 'sig' is indexed here -- a stream entry without a signature
            # would raise KeyError; confirm whether that can occur.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
605
606
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter pages: GET the disclaimer, then POST the opt-out form.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Disable the family filter by visiting the disclaimer page and
        posting the over-18 confirmation form (cookie-based)."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube: 'yt-<ID>' ids are delegated
        # to the YouTube extractor via the downloader.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information.
        # NOTE(review): the response is kept as bytes (no .decode) and then
        # matched with str regexes and .decode('utf-8') calls below -- this
        # is Python-2-only behavior; under Python 3 it would raise. Confirm
        # against the project's compat layer.
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Old-style page: direct mediaURL parameter.
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # New-style page: media URL and key live inside the flashvars
            # query string, JSON-ish encoded.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
732
733
734 class DailymotionIE(InfoExtractor):
735 """Information Extractor for Dailymotion"""
736
737 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
738 IE_NAME = u'dailymotion'
739 _WORKING = False
740
741 def __init__(self, downloader=None):
742 InfoExtractor.__init__(self, downloader)
743
744 def report_extraction(self, video_id):
745 """Report information extraction."""
746 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
747
748 def _real_extract(self, url):
749 # Extract id and simplified title from URL
750 mobj = re.match(self._VALID_URL, url)
751 if mobj is None:
752 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
753 return
754
755 video_id = mobj.group(1).split('_')[0].split('?')[0]
756
757 video_extension = 'mp4'
758
759 # Retrieve video webpage to extract further information
760 request = compat_urllib_request.Request(url)
761 request.add_header('Cookie', 'family_filter=off')
762 webpage = self._download_webpage(request, video_id)
763
764 # Extract URL, uploader and title from webpage
765 self.report_extraction(video_id)
766 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
767 if mobj is None:
768 self._downloader.trouble(u'ERROR: unable to extract media URL')
769 return
770 flashvars = compat_urllib_parse.unquote(mobj.group(1))
771
772 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
773 if key in flashvars:
774 max_quality = key
775 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
776 break
777 else:
778 self._downloader.trouble(u'ERROR: unable to extract video URL')
779 return
780
781 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
782 if mobj is None:
783 self._downloader.trouble(u'ERROR: unable to extract video URL')
784 return
785
786 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
787
788 # TODO: support choosing qualities
789
790 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
791 if mobj is None:
792 self._downloader.trouble(u'ERROR: unable to extract title')
793 return
794 video_title = unescapeHTML(mobj.group('title'))
795
796 video_uploader = None
797 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
798 if mobj is None:
799 # lookin for official user
800 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
801 if mobj_official is None:
802 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
803 else:
804 video_uploader = mobj_official.group(1)
805 else:
806 video_uploader = mobj.group(1)
807
808 video_upload_date = None
809 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
810 if mobj is not None:
811 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
812
813 return [{
814 'id': video_id,
815 'url': video_url,
816 'uploader': video_uploader,
817 'upload_date': video_upload_date,
818 'title': video_title,
819 'ext': video_extension,
820 }]
821
822
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    Extracts the direct .flv media URL from a photobucket video page;
    title and uploader are parsed out of the page's <title> element.
    """

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL; the video id is the .flv filename captured
        # from the 'current=' query parameter.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode once here: urlopen() returns bytes, and the regular
            # expressions below are text patterns (matching bytes against
            # str patterns raises TypeError on Python 3).
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Match groups are already text after the decode above, so no
        # per-value .decode('utf-8') calls (those broke under Python 3).
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
886
887
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a Yahoo! Video URL.

        Non-'/watch/' URLs are first resolved to their canonical
        /watch/<vid>/<id> form and re-extracted; new_video=False marks
        that second pass.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                # Decode: urlopen() returns bytes; the patterns below are text.
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # Group 1 is the (people|profile) alternation; the uploader name is
        # group 2 (the previous code used group(1) and returned 'people').
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed for the playlist request)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1029
1030
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Parses the config JSON embedded in the video page, then picks the
    best available codec/quality pair and builds the play_redirect URL.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize protocol-less and direct-link URLs to the canonical page.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded between ' = {config:' and ',assets:'
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the ' = {config:' marker is missing from the page;
            # ValueError: the extracted chunk is not valid JSON.
            # (Previously a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.)
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first codec available at the best quality tier.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1149
1150
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Supports the "+7" catch-up videos (extractPlus7Stream) and, partially,
    live streams (extractLiveStream). Page scraping is done through the
    grep_webpage helper, which applies a regex to a downloaded page and
    maps match groups to info-dict keys.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in index-<n>.html
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        # Download url and return the raw response body.
        # Returns None after reporting trouble on network/URL errors.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        # Fetch url, apply regex, and build {key: matched group} from
        # matchTuples, a list of (group_index, key, error_message).
        # Returns None (after reporting trouble) when the regex or any
        # required group fails to match.
        # NOTE(review): if fetch_webpage failed and returned None,
        # re.search(regex, None, ...) raises TypeError here rather than
        # reporting cleanly — confirm whether callers rely on that.
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # Two-step resolution: find the videothek JS on the page, then grep
        # that script for the per-language rtmp path/player/url triple.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # NOTE(review): video_url is computed but never returned or stored,
        # and _real_extract discards this call's result — live-stream
        # extraction therefore produces no info dict (latent bug).

    def extractPlus7Stream(self, url):
        # Three-step resolution: page -> videoref XML URL -> per-language
        # <video> ref URL -> final XML with id/name/date and the HD url.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            # NOTE(review): .decode assumes the title is bytes; on Python 3
            # re against a str page yields str and this would fail — confirm.
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live URLs are detected by their index-<n>.html suffix; see the
        # NOTE in extractLiveStream — this branch currently returns nothing.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1285
1286
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    First follows URL-shortener style redirects (restarting the chain on
    the final URL), then scrapes the page for common embedded-player
    patterns (JW Player flashvars, file=/source= parameters).
    """

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        # Restart the download chain on the redirect target.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # The media filename doubles as the id; its extension becomes ext.
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # Was mislabeled 'unable to extract title' (copy-paste error).
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
1434
1435
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles 'ytsearch:', 'ytsearchN:' and 'ytsearchall:' pseudo-URLs by
    querying the GData API and queueing the resulting watch URLs.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'ytsearch'
        query = query.encode('utf-8')

        # Empty prefix -> one result; 'all' -> the maximum; otherwise the
        # prefix is a requested result count.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        try:
            n = int(prefix)
        except ValueError:  # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return
        if n <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_youtube_results:
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n  # tightened once the API reports totalItems

        while 50 * pagenum < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), 50 * pagenum + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            video_ids.extend(video['id'] for video in api_response['items'])

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # Queue at most n results for download.
        for video_id in video_ids[:n]:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1510
1511
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles 'gvsearch:', 'gvsearchN:' and 'gvsearchall:' pseudo-URLs by
    scraping the result pages and queueing the videoplay URLs.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'gvsearch'
        query = query.encode('utf-8')

        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        try:
            n = int(prefix)
        except ValueError:  # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return
        if n <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_google_results:
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect unseen video identifiers from this page, stopping as
            # soon as the requested count is reached.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id in video_ids:
                    continue
                video_ids.append(video_id)
                if len(video_ids) == n:
                    break
            else:
                # Didn't hit n on this page; fetch another if one exists.
                if re.search(self._MORE_PAGES_INDICATOR, page) is not None:
                    pagenum = pagenum + 1
                    continue

            # Either n videos were collected or no further pages remain.
            for video_id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])
            return
1593
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles 'yvsearch:', 'yvsearchN:' and 'yvsearchall:' pseudo-URLs by
    scraping the result pages and queueing the watch URLs.
    """

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'yvsearch'
        query = query.encode('utf-8')

        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        try:
            n = int(prefix)
        except ValueError:  # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return
        if n <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_yahoo_results:
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect unseen video identifiers from this page, stopping as
            # soon as the requested count is reached.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id in already_seen:
                    continue
                already_seen.add(video_id)
                video_ids.append(video_id)
                if len(video_ids) == n:
                    break
            else:
                # Didn't hit n on this page; fetch another if one exists.
                if re.search(self._MORE_PAGES_INDICATOR, page) is not None:
                    pagenum = pagenum + 1
                    continue

            # Either n videos were collected or no further pages remain.
            for video_id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])
            return
1678
1679
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Parse the URL; bail out on anything we do not recognize.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # A trailing id in the URL means a single video was requested.
        single_video_id = mobj.group(3)
        if single_video_id is not None:
            self._downloader.download([single_video_id])
            return

        # Playlists default to prefix 'p'; artist pages need a different
        # access point than regular playlists.
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        video_ids = []
        pagenum = 1

        # Walk the paginated playlist until the "next page" marker disappears.
        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids on this page, skipping duplicates within the page.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                candidate = match.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        total = len(video_ids)

        # Honour --playlist-start / --playlist-end.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1758
1759
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Validate the URL and pull out the channel id.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        # Page through the channel's video list until no "next page" link remains.
        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # De-duplicate ids within the current page before accumulating them.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        # Queue every collected video for download.
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1810
1811
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Validate the URL and pull out the username.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries, so we
        # request successive windows until a short (non-full) page arrives.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Gather this window's ids, dropping duplicates within the window.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                if match.group(1) not in ids_in_page:
                    ids_in_page.append(match.group(1))
            video_ids.extend(ids_in_page)

            # A page with fewer than _GDATA_PAGE_SIZE ids is the last one;
            # no further queries are needed.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honour --playlist-start / --playlist-end.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1894
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        # Fetch the user page once to resolve the numeric users_id needed by
        # the episode-list AJAX endpoint.
        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # BUGFIX: the data-users-id match was previously dereferenced inside
        # the network try-block without a None check, so a missing attribute
        # raised an uncaught AttributeError instead of a proper error report.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract blip.tv user id')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request(page_base + "&page=" + str(pagenum))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # CONSISTENCY: use compat_str like every other extractor here
                # (was plain str(err)).
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers.
            # BUGFIX: unescape before the membership test; previously the raw
            # match was compared against already-unescaped stored entries, so
            # ids containing HTML entities could be appended twice.
            ids_in_page = []
            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honour --playlist-start / --playlist-end.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1985
1986
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct file URL and title from a depositfiles page."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # BUGFIX: decode the response once here. The previous code kept
            # the body as bytes (which str regex patterns cannot match on
            # Python 3) and then called .decode() on str match groups below.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace so the site's message fits on one line.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1)

        # All values are already text after the single decode above.
        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
2045
2046
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook before extraction, if credentials are available."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # No credentials: proceed anonymously.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # BUGFIX: decode the response so the str regex below also works
            # on Python 3, where urlopen().read() returns bytes.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # A login form in the response means authentication failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                # BUGFIX: corrected 'exceded' typo in the warning message.
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL, title and metadata from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded between these two JS fragments.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream; fall back to SD.
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2142
2143
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    # Any blip.tv path; group(1) is the path after the host.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pulls the filename extension off a direct media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL resolved to a directly downloadable media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        # Returns a single-element list with the info dict, or None on error.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Request a JSON description of the video by appending skin=json,
        # preserving any query string the URL already carries.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # The iTunes user agent is sent again later via the info dict
        # ('user_agent' key) for the actual media download.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): .decode on a str fails on Python 3; this branch
                # presumably predates the Python 3 port — confirm before relying on it.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # Reuse the handle opened above; its body is the JSON payload.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' object or bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Convert the site's 'MM-DD-YY HH:MM(AM|PM)' stamp to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            # KeyError covers missing JSON fields; ValueError covers bad
            # dates, bad JSON, and the extension failure raised above.
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2233
2234
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the FLV URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was `self._download.trouble` — an AttributeError at
            # runtime whenever an invalid URL reached this extractor.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media base URL; the FLV lives beside it.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
2283
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                     (?P<showname>thedailyshow|colbertnation)\.com/
                     (full-episodes/(?P<episode>.*)|
                      (?P<clip>
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, lowest-quality last; turls entries use these as keys.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Bitrate -> container extension.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Bitrate -> display resolution, for --list-formats output.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the default suitable()
        # (which matches without that flag) cannot be used.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the episode's MRSS show index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        # Print "bitrate : ext [dimensions]" for each known format.
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per video part of the episode/clip at `url`.

        Returns a list of info dicts, or None on error.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shortname forms (:tds, :colbert, ...) redirect to the show's
        # full-episodes page, then the URL is re-parsed below.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Work out the episode title; dlNewest means "latest full episode",
        # resolved via the HTTP redirect followed below.
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The site redirected us to the newest episode; re-parse the
            # final URL to get its concrete episode id.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # The mgid URI identifies the episode; fetch its MRSS index, which
        # lists the individual video parts as <item> elements.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like ...:<show>.com:<mediaId>; split out both parts.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Each part has its own configuration XML listing the available
            # renditions (bitrate + RTMP source URL).
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into a plain HTTP URL on the CDN.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2477
2478
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the FLV URL and metadata from an Escapist episode page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the charset declared in the Content-Type header.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # ROBUSTNESS: the matches below were previously dereferenced without
        # None checks, so any page-layout change raised an AttributeError
        # instead of the extractor's usual error report.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if descMatch is None or imgMatch is None or playerUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract page metadata')
            return
        description = unescapeHTML(descMatch.group(1))
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrl = unescapeHTML(playerUrlMatch.group(1))

        # The player URL carries the (percent-encoded) config URL.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract config URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The second playlist entry holds the actual media URL.
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2552
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # Validate the URL and pull out the numeric video id.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # First request: metadata XML carrying title, description, thumbnail
        # and the URL of the HDS manifest.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            # findall()[0] raises IndexError when an expected element is absent.
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # Second request: the Adobe HDS (f4m) manifest referenced above.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # Elements live in the f4m namespace; take the first <media> entry.
            # NOTE: video_id is deliberately rebound to the manifest's <id> text.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Compose the first-fragment URL from the manifest host and media id.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2623
2624
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = m.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Video URL: percent-encoded inside a flashvars-style parameter.
        m = re.search(r'flv_url=(.+?)&', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(m.group(1))

        # Title: taken from the page <title> tag.
        m = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = m.group(1)

        # Thumbnail: the whole matched URL is used, not a capture group.
        m = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = m.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2682
2683
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # Flow: resolve the human-readable URL to a track id via the
        # /resolve.json API, then ask the streams endpoint for media URLs.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Step 1: resolve the canonical track JSON (contains numeric id,
        # uploader info, title, timestamps).
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Step 2: fetch the per-track stream definitions and pick the
        # 128kbit MP3 HTTP stream.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
2756
2757
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The RTMP stream id is base64-encoded in the page source.
        match = re.search(r"jsclassref='([^']*)'", webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(match.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Title comes from an inline JavaScript assignment.
        match = re.search(r'contentTitle = "(.*?)";', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # Description is optional; fall back to a fixed placeholder.
        match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = match.group(1) if match is not None else u'No description available.'

        # Derive the id and extension from the stream URL's last path component.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2811
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Returns the list of candidate URLs for the given format; when the
        format entry maps bitrates to URL lists, picks the requested (or
        highest) bitrate.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print the available format/bitrate/extension combinations."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url.
        # NOTE: re.match on a str already yields str groups; the previous
        # .decode('utf-8') calls crashed on Python 3 (str has no decode).
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe each advertised format until one yields a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # All values below are already unicode strings (JSON-decoded or
        # derived from str), so no further .decode() is needed.
        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2926
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # Three URL shapes are handled: a single video (course+video groups),
        # a course page (course group only), and the site root (neither).
        # Playlist pages recurse through self.extract on reference entries.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            # Extension is whatever follows the last dot of the video file name.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                          note='Downloading course info page',
                                          errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Each video link becomes a 'reference' entry resolved recursively.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Every course page on the root becomes a 'reference' entry,
            # each of which re-enters this extractor via self.extract.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3038
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # NOTE: _download_webpage returns an already-decoded unicode string,
        # so the match groups must not be .decode('iso-8859-1')d again —
        # that raised AttributeError on Python 3 (str has no decode).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Message fixed: previously read 'unable to mtvn_uri'.
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # The mediaGen endpoint returns an XML list of renditions for the clip.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3118
3119
class YoukuIE(InfoExtractor):
    # Information extractor for v.youku.com; videos are served as numbered
    # segments whose URLs are derived from a seed-scrambled file id.
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        # Session id: current time in ms plus two random components.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the alphabet using the server-provided
        # seed and a linear congruential generator; the order of removals
        # from `source` is part of the algorithm, do not reorder.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # Decode the '*'-separated index list into characters of the
        # seed-shuffled alphabet to recover the real file id.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # The playlist endpoint returns JSON with title, seed, format map
        # and per-segment keys.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the user's format request onto Youku's stream names:
            # hd2 when available for 'best', mp4 for 'worst', flv otherwise.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3229
3230
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = m.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page; all interesting values sit in
        # flashvars-style parameters matched by the class-level patterns.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        m = re.search(self.VIDEO_URL_RE, webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(m.group(1))

        m = re.search(self.VIDEO_TITLE_RE, webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = m.group(1)

        m = re.search(self.VIDEO_THUMB_RE, webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = m.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3293
3294
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date (optional; left as None when absent)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (optional)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # Fixed: without this return, mobj.group(1) below raised
            # AttributeError on None after the error was already reported.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # Fixed: without this return, links[-1] below raised IndexError
            # on the empty result after the error was already reported.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
3418
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The MP4 URL is derived from the page path on Turner's CDN.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Return the unescaped first group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed key: was the typo 'uploader_date'; the documented
            # (and consumed) field name is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3454
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (item_count, info_dicts).

        BUG FIX: the error paths used to return None, which made the
        caller's tuple unpacking (`page_count, page_info = ...`) raise
        TypeError; they now return (0, []) so extraction ends gracefully
        after trouble() has been reported.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            # The API signals errors with a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # Normalize 'YYYY-MM-DD...' to YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Extract a single broadcast, or page through a channel's archives."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Only the channel name matched: fetch the whole archive, paged.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3541
class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Extract video URL, title and optional description from funnyordie.com.

        BUG FIX: the failure branches used to fall through and call
        m.group() on None (AttributeError); they now return after
        trouble(), matching the convention used by the other extractors.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title'))

        # Description is optional.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3578
class TweetReelIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        """Extract the .mov video and tweet metadata from a tweetreel page.

        BUG FIX: the failure branches previously fell through and called
        .group() on a None match object (AttributeError); fatal ones now
        return after trouble(), and a missing description degrades to None.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            return
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'WARNING: Cannot find description')
            desc = None
        else:
            # Strip the embedded <a> links before unescaping the tweet text.
            desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            return
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        # The tweet text doubles as the video title.
        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
3627
class SteamIE(InfoExtractor):
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses extended syntax, so re.VERBOSE is required here.
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        return match is not None

    def _real_extract(self, url):
        """Collect every movie listed on the game's Steam video page."""
        page_match = re.match(self._VALID_URL, url, re.VERBOSE)
        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        game_id = page_match.group('gameID')
        video_page_url = 'http://store.steampowered.com/video/%s/' % game_id
        webpage = self._download_webpage(video_page_url, game_id)
        title_re = r'<span class="title">(?P<videoName>.+?)</span>'
        movie_matches = re.finditer(movie_re, webpage)
        title_matches = re.finditer(title_re, webpage)
        results = []
        # Movies and titles appear in the same order on the page.
        for movie_match, title_match in zip(movie_matches, title_matches):
            vid_id = movie_match.group('videoID')
            vid_url = movie_match.group('videoURL')
            if not vid_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % vid_id)
            results.append({
                'id': vid_id,
                'url': vid_url,
                'ext': 'flv',
                'title': unescapeHTML(title_match.group('videoName'))
            })
        return results
3663
class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Derive the CDN flv URL from the recording id and scrape title/uploader."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }]
3685
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Extract the stream URL and metadata from the page's embedded gon.show JSON."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        json_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not json_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            show = json.loads(json_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s stream from Akamai.
        stream_url = show['akamai_url'] + '&cbr=256'
        stream_ext = compat_urllib_parse_urlparse(stream_url).path.rpartition('.')[2]
        return [{
            'id': video_id,
            'url': stream_url,
            'ext': stream_ext,
            'title': show['title'],
            'description': show.get('teaser_text'),
            'location': show.get('country_of_origin'),
            'uploader': show.get('host', {}).get('name'),
            'uploader_id': show.get('host', {}).get('slug'),
            'thumbnail': show.get('image', {}).get('large_url_2x'),
            'duration': show.get('duration'),
        }]
3720
3721
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals req_format, else None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        """Extract one or all formats of a youporn video.

        Honors --list-formats and the requested format. The download list
        on the page is ordered highest resolution first.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age-check cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (optional)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: this used to test the unrelated 'result' variable
            # (the last re.search match above), so a missing format was
            # never reported and [None] was returned to the downloader.
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3838
3839
3840
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract the flv URL and upload date; the title comes from the URL itself."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: the message wrongly said "title" although this branch
            # fails on the upload date.
            self._downloader.trouble(u'ERROR: unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3882
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Follow the embed page and pull the flv URL out of its player setup."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Fetch the watch page first; it links to the embed player.
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        # The numeric id on the embed link supersedes the slug from the URL.
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = source_match.group('source')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3928
class EightTracksIE(InfoExtractor):
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the 8tracks play/next API and return every track of the mix."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        mix = json.loads(mix_match.group(1))

        # A random session token is enough to drive the playback API.
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        track_count = mix['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        tracks = []
        track_number = 0
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            tracks.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return tracks
3972
class KeekIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Build the CDN video/thumbnail URLs from the id and scrape title/uploader."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        uploader_match = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': unescapeHTML(title_match.group('title')),
            'thumbnail': thumbnail,
            'uploader': unescapeHTML(uploader_match.group('uploader'))
        }]
3996
class TEDIE(InfoExtractor):
    # Matches single talks and whole playlists; the named groups
    # 'type_talk'/'type_playlist' decide the extraction path below.
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written in extended syntax and
        # must be matched with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL type: a talk yields one info dict, a playlist many.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): the character class [.\s]*? matches only literal dots
        # and whitespace between attributes, not arbitrary characters --
        # presumably this relies on the page's exact markup; confirm before
        # changing.
        video_RE=r'''
                 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                 ([.\s]*?)data-playlist_item_id="(\d+)"
                 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                 '''
        video_name_RE=r'<p\ class="talk-title"><a href="/talks/(.+).html">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        # Pair each talk entry with its title; both iterate in page order.
        info=[]
        for m_video, m_name in zip(m_videos,m_names):
            video_dic={
                'id': m_video.group('video_id'),
                'url': self._talk_video_link(m_video.group('mediaSlug')),
                'ext': 'mp4',
                'title': m_name.group('fullname')
                }
            info.append(video_dic)
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>[\s\w:/\.\?=\+-\\\']*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails JS object carries the numeric id and the media slug
        # needed to build the download URL.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title
                }
        return info
4068
class MySpassIE(InfoExtractor):
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Extract a myspass.de video via its XML metadata service."""
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this fallback referenced the undefined name 'ext',
            # raising NameError whenever <format_id> was missing; fall back
            # to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4124
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # GenericIE must stay last: it accepts nearly any URL.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        GenericIE,
    )
    return [ie_class() for ie_class in extractor_classes]
4176
4177