# Source: jfr.im git mirror - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
# SoundcloudSetIE: Use upload_date in the unified format (fixes #812)
# [yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Base class for information extractors.

    An information extractor takes a URL and produces a list of dictionaries
    describing the video(s) behind it (real media URL, title, uploader, ...).
    The FileDownloader then consumes those dictionaries and performs the
    actual download, among other possible outcomes.

    Mandatory dictionary fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should define a _VALID_URL regexp and redefine the
    _real_initialize() and _real_extract() methods; _real_extract() must
    return a *list* of information dictionaries as described above.
    Subclasses should probably also be added to the list of extractors.

    Broken extractors should set _WORKING to False so users are warned
    and the tests are skipped.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor; optionally attach a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Return True when this extractor can handle the given URL."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Return the _WORKING flag (False marks a broken extractor)."""
        return cls._WORKING

    def initialize(self):
        """Perform one-time initialization (authentication, etc.)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader this extractor reports through."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the public name from the class name by dropping the
        # trailing 'IE' suffix (e.g. 'YoutubeIE' -> 'Youtube').
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note=None prints the default message, note=False suppresses it.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as exc:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, preserving the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(exc)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        # Honour the charset declared in the Content-Type header; default to utf-8.
        charset_match = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)',
                                 urlh.headers.get('Content-Type', ''))
        encoding = charset_match.group(1) if charset_match else 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            self._downloader.to_screen(base64.b64encode(webpage_bytes).decode('ascii'))
        return webpage_bytes.decode(encoding, 'replace')

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Helpers implementing #608: they stamp the correct '_type' on results.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        return {'_type': 'url',
                'url': url,
                'ie_key': ie}

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        result = {'_type': 'playlist',
                  'entries': entries}
        if playlist_id:
            result['id'] = playlist_id
        if playlist_title:
            result['title'] = playlist_title
        return result
185
186
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Handles login (optionally via .netrc), age confirmation, subtitle
    download and format selection.  _real_extract() returns one info dict
    per selected format.
    """

    # Verbose regex: group 1 wraps the whole optional URL prefix, group 2
    # captures the video ID itself (see _extract_id).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions string (note: listed as HEIGHTxWIDTH)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs would also match _VALID_URL; defer them to the
        # playlist extractor.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check the available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download subtitles for one language/format."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that a requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return a dict mapping lang_code -> track name, or, on failure,
        a tuple (error_message, None).  Callers distinguish the two cases
        with isinstance(..., tuple)."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the available subtitle languages for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]

        Language preference: --sub-lang option, then 'en', then the first
        available language.
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): # error: could not get the available subtitles
            return [(sub_lang_list[0], None, None)]
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; return a list of
        (error_message, sub_lang, sub) tuples (one per language)."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): # error: could not get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print each itag with its extension and dimensions (for --list-formats)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, then (optionally) log in and confirm age.

        Credentials come from the --username/--password options or, with
        --netrc, from the 'youtube' machine entry in ~/.netrc.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language (forces English pages so the scraping regexes match)
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the GALX and dsh hidden form tokens required by the login POST
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url': '/',
                'action_confirm': 'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID (group 2 of _VALID_URL) or report an error."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Download webpage + get_video_info, then build one info dict per
        selected format (see the format-selection section below)."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one returns a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces before handing to unified_strdate
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            # fall back to the meta description tag
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build itag -> signed URL map from the stream map
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
676
677
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter/age form so
        that filtered videos are reachable."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract media URL, title and uploader for a metacafe video.

        Returns a one-element list with the info dict, or a url_result
        redirect when the video is actually hosted on YouTube.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube ("yt-<id>" ids are redirects)
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information.
        # NOTE: _download_webpage returns an already-decoded unicode string,
        # so values sliced out of it below must NOT be .decode()d again --
        # the previous .decode('utf-8') calls raised AttributeError on
        # Python 3 (str has no decode) and were redundant on Python 2.
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Old-style page: media URL directly in the page
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # New-style page: media data inside the flashvars parameter
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
781
782
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Scrape the Dailymotion page and return a one-element info list."""
        # Validate the URL and pull the raw video id out of it
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Drop the "_title" suffix and any query string from the id
        video_id = m.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Fetch the page with the family filter disabled
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)

        # Locate the flashvars blob that carries the media URLs
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Pick the best quality key present in flashvars (highest first)
        max_quality = None
        for quality_key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if quality_key in flashvars:
                max_quality = quality_key
                self.to_screen(u'Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Uploader: try the regular owner span first, then the official-user span
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date appears as DD-MM-YYYY in the page; emit YYYYMMDD
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
862
863
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # The media file is carried in the page URL's "current" query
    # parameter; group(1) captures the .flv filename and doubles as the
    # video id.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract the single FLV video behind a photobucket page URL.

        Returns a one-element list with the info dictionary, or None
        after reporting an error (this file's legacy convention: report
        via self._downloader and fall through).
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # The real media URL is embedded in the "file" query parameter of
        # the video_src <link> element.
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader both come from the <title> element:
        # "<title> video by <uploader> - Photobucket".
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # NOTE(review): the .decode('utf-8') calls assume Python 2 byte
        # strings (webpage is the raw, undecoded response body); on
        # Python 3 str has no .decode — TODO confirm intended runtime.
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        }]
916
917
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def _real_extract(self, url, new_video=True):
        """Extract video information from a video.yahoo.com URL.

        Non-'/watch/' URLs are first rewritten to the canonical English
        '/watch/' form and re-extracted (a single level of recursion,
        flagged by new_video=False). Returns a one-element list with the
        info dictionary, or None after reporting an error.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # BUGFIX: group(1) only captures the 'people'/'profile' path
        # segment of the profile link; the uploader's display name is
        # captured by group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width; both are required parameters of
        # the playlist request below.
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML; the final URL is
        # APP + FULLPATH with any duplicated leading slash dropped.
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]
1048
1049
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract video information from a vimeo.com URL.

        Parses the page's embedded config JSON for title, uploader,
        thumbnail and the available codec/quality files, then builds the
        play_redirect URL. Returns a one-element list with the info
        dictionary, or None after reporting an error.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize schemeless and direct-link URLs to a canonical page URL.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page between the
        # ' = {config:' marker and ',assets:'.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the config marker is missing from the page;
            # ValueError: the extracted blob is not valid JSON.
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (YYYYMMDD from the dateCreated meta tag)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best bucket that has at least one entry; the for/else
        # reports an error only when all buckets are empty.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1160
1161
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in "index-<n>.html".
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        # Download url and return the raw response body.
        # NOTE(review): on failure this reports the error and returns
        # None, but grep_webpage does not check for None and would then
        # raise on re.search(regex, None) — verify intended behavior.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex and map capture groups into a dict.

        matchTuples is a list of (group_index, key, error_message); every
        listed group must have matched, otherwise the error is reported
        and None is returned (callers then fail on info.get — see note
        in fetch_webpage about this file's legacy error convention).
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # Follow the videothek JS to the geo-gated SWF/RTMP descriptor.
        # NOTE(review): video_url is computed but never returned and this
        # method has no return value, so live streams currently produce
        # no download (see _real_extract) — looks unfinished.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url', u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an arte+7 page through two redirect documents to the
        final <video> XML and return an info dictionary for the 'hd'
        quality URL."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date', u'could not extract video date: %s' % url),
                (4, 'url', u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        # Dispatch between live streams and arte+7 catch-up pages.
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1285
1286
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn the user that no specialized extractor matched (suppressed
        # in test mode).
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so the body is never transferred.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Unchanged URL means no redirect happened.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Last-resort extraction: follow redirects, then scrape the page
        for a JW-Player-style file/source URL. Returns a one-element
        list with the info dictionary, or None after reporting an
        error."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
            if mobj is None:
                # Broaden the search a little bit: JWPlayer JS loader
                mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
                if mobj is None:
                    self._downloader.report_error(u'Invalid URL: %s' % url)
                    return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # BUGFIX: this error previously reported 'unable to extract
            # title', which misidentified the failing step.
            self._downloader.report_error(u'unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
1423
1424
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # NOTE(review): _get_n_results passes a utf-8 encoded query but
        # this decodes with preferredencoding() — mismatched on non-utf-8
        # locales (Python 2 byte-string era); confirm before changing.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'ytsearch<n>:' / 'ytsearchall:' query and return the
        matching video results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the 8-char 'ytsearch' marker
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # BUGFIX: the result list was computed but never returned, so
            # 'ytsearchall' queries produced no downloads.
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # Page through the GData API, 50 results per page, until we have
        # collected n ids or the API reports fewer total items.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request more than the API says exists.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
1495
1496
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        readable_query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (readable_query, pagenum))

    def _real_extract(self, query):
        """Parse a 'gvsearch<n>:' / 'gvsearchall:' query and download the
        requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # drop the 8-char 'gvsearch' marker
        query = query.encode('utf-8')

        # Guard clauses for the two non-numeric prefixes.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return

        try:
            wanted = int(prefix)
            if wanted <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (wanted, query))
                return
            if wanted > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, wanted))
                wanted = self._max_google_results
            self._download_n_results(query, wanted)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        collected_ids = []
        pagenum = 0

        def _dispatch_downloads():
            # Hand every collected id over to the downloader.
            for video_id in collected_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect new video identifiers from this results page.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in collected_ids:
                    collected_ids.append(candidate)
                    if len(collected_ids) == n:
                        # Specified n videos reached
                        _dispatch_downloads()
                        return

            # No "next page" link means the result set is exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                _dispatch_downloads()
                return

            pagenum = pagenum + 1
1574
1575
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        # Parse the 'yvsearch<n>:' / 'yvsearchall:' prefix and dispatch
        # to _download_n_results with the requested count.
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        # 'yvsearch' is 8 characters; what remains is '', 'all' or a number.
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        # Page through search results until n unique ids are collected or
        # no "Next" link remains; then hand them all to the downloader.
        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1657
1658
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose regex (compiled with re.VERBOSE — see suitable()): either a
    # youtube.com playlist-style page URL, or a bare PL/EC/UU playlist id.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    # GData API page size; also used to detect the last page.
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        # group(1) is set for page-URL matches, group(2) for bare ids.
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            # start-index is 1-based in the GData API.
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Keep (position, video_url) pairs so the playlist can be
            # re-sorted by position after all pages are fetched.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means this was the last one.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then drop the position key.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1737
1738
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    The first listing page is fetched as plain HTML; any further pages are
    fetched through the channel_ajax JSON endpoint while the load-more
    marker is still present.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Substring present while more pages remain to be loaded.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def extract_videos_from_page(self, page):
        """Return the watch-link video ids found in *page*, in first-seen
        order with duplicates removed."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        """Collect every video id of the channel and return a playlist result."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                self.report_download_page(channel_id, pagenum)
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                request = compat_urllib_request.Request(url)
                try:
                    page = compat_urllib_request.urlopen(request).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                    return

                page = json.loads(page)

                # The AJAX response carries the listing HTML and the
                # load-more widget HTML as separate fields.
                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # The widget disappears once the last page was served.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
1811
1812
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # The GData API caps each request at this many results.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    # Pattern used to pull video ids out of the feed body.
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect every upload id of the user and return a playlist result."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            # start-index is 1-based in the GData API.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1880
1881
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves a user page to its numeric users_id, then pages through the
    mobile "full episode list" endpoint (_PAGE_SIZE entries per call) and
    returns the collected episode URLs as a playlist result.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Number of entries the episode-list endpoint returns per page.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self.to_screen(u'user %s: Downloading video ids from page %d' %
                (username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id is embedded in the profile page; without it
        # the episode-list endpoint cannot be queried.  Previously a
        # missing match raised an uncaught AttributeError here.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract user id')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str (not str) so non-ASCII error text cannot raise
                # on Python 2 — consistent with the rest of this file.
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1957
1958
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Extract the direct file URL and title from a depositfiles page.

        Requests the English-locale page with the 'Free download' form
        pre-submitted, then scrapes the real file URL and the title.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed.
        # urlencode() returns text; encode to bytes so the POST also works
        # on Python 3 (a harmless round trip on Python 2).
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication).encode('ascii'))
        try:
            self.report_download_webpage(file_id)
            # Decode immediately: the regexes below use str patterns, which
            # cannot match a bytes response on Python 3.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # All values are already text now that the page is decoded above;
        # the old per-field .decode('utf-8') calls broke on Python 3.
        file_title = mobj.group(1)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
2009
2010
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Optionally log in before extraction.

        Credentials come from --username/--password or, with --netrc, from
        the 'facebook' machine entry in ~/.netrc.  Without credentials this
        is a no-op; a failed login only emits a warning.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        # Encode the POST body: Request() requires bytes data on Python 3
        # (a harmless round trip on Python 2).
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('ascii'))
        try:
            self.report_login()
            # Decode the response so the str-pattern regex below also works
            # on Python 3, where read() returns bytes.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL (HD preferred, SD fallback) and metadata
        from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters sit between these two JS snippets.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-encoded JSON describing the available sources.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2108
2109
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pulls the filename extension off a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video information from a blip.tv URL.

        /play/ URLs are first resolved via their redirect to the canonical
        http://blip.tv/a/a-<id> form.  Ordinary URLs are queried through the
        JSON API ("skin=json"); if the server answers with the media file
        itself, a direct-download info dict is returned instead.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # Follow the redirect, pull the file id out of the fragment and
            # restart extraction with the canonical URL.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # The JSON skin is served for this user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): .decode presumes a Python 2 byte string; on
                # Python 3 `title` is already text — confirm.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Convert the API's "mm-dd-yy HH:MM(am|pm)" stamp to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2206
2207
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self,url):
        """Extract the FLV URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: was `self._download.report_error`, a typo that raised
            # AttributeError instead of reporting the invalid URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link carries the media-server base URL; the FLV
        # lives next to the thumbs directory.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
2249
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                           (?P<clip>
                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, ascending; the matching container/dimensions follow.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_config_download(self, episode_id, media_id):
        """Report that the media configuration is being fetched."""
        self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report that the MRSS show index is being fetched."""
        self.to_screen(u'%s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the bitrate/extension/dimensions table for --list-formats."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per part of the episode or clip.

        Shortname URLs (:tds, :colbert) are first redirected to the newest
        full episode; the embedded mtvnservices URI is then resolved through
        the MRSS index, and each part's RTMP URL is rewritten to its
        progressive-download HTTP equivalent.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        if mobj.group('shortname'):
            # Shortcuts map to the show's full-episodes landing page.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No explicit episode means "download the newest one".
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The landing page redirected to a concrete episode; re-parse
            # the final URL to recover its title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS index lists one <item> per part of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like ...:<show>.com:<media id>.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Each <rendition> advertises one (bitrate, rtmp url) pair.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL to the equivalent HTTP download location.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2441
2442
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_config_download(self, showName):
        """Report that the player configuration is being fetched."""
        self.to_screen(u'%s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the media URL via the player configuration.

        The og:video meta tag points at the Flash player; its ``config=``
        parameter references a JavaScript-flavoured JSON playlist from which
        the actual media URL is read.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the charset declared in Content-Type, default UTF-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # NOTE(review): the .group(1) calls below assume every meta tag is
        # present; a missing tag raises AttributeError instead of a clean
        # extractor error.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        # playlist[1] holds the media URL; presumably index 0 is a preroll
        # or intro entry — TODO confirm.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2513
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked broken: excluded from normal extractor selection.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract metadata from the moogaloop XML and the media location
        from the Adobe f4m manifest it references."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        # The f4m manifest (Adobe HDS, namespaced) names the media node and
        # the stream id used to build the fragment URL.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Build the URL of the first segment/fragment of the f4f stream.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2580
2581
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract the media URL, title and thumbnail from an xvideos page."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = m.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is percent-encoded inside the page's flashvars.
        m = re.search(r'flv_url=(.+?)&', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        media_url = compat_urllib_parse.unquote(m.group(1))

        # Title comes from the page <title>, minus the site suffix.
        m = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        title = m.group(1)

        # Thumbnail: the entire matched URL is used (group 0), not the
        # trailing filename capture.
        m = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        thumbnail = m.group(0)

        return [{
            'id': video_id,
            'url': media_url,
            'uploader': None,
            'upload_date': None,
            'title': title,
            'ext': 'flv',
            'thumbnail': thumbnail,
            'description': None,
        }]
2635
2636
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

    Resolves the track through the public API, then queries the streams
    endpoint for the 128 kbps MP3 URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # uploader and track slug are both encoded in the URL
        uploader, slug_title = mobj.group(1), mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            resolve_request = compat_urllib_request.Request(resolv_url)
            info_json = compat_urllib_request.urlopen(resolve_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            streams_request = compat_urllib_request.Request(streams_url)
            stream_json = compat_urllib_request.urlopen(streams_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
2703
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets.

    Resolves the set through the public API and then fetches the
    128 kbps MP3 stream URL for every track it contains.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # uploader and set slug are both encoded in the URL
        uploader, slug_title = mobj.group(1), mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            resolve_request = compat_urllib_request.Request(resolv_url)
            info_json = compat_urllib_request.urlopen(resolve_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        # The API signals failures via an 'errors' list instead of HTTP codes
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        videos = []
        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            try:
                streams_request = compat_urllib_request.Request(streams_url)
                stream_json = compat_urllib_request.urlopen(streams_request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            videos.append({
                'id': video_id,
                'url': streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
2777
2778
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract an rtmpe video URL from an InfoQ presentation page."""
        if re.match(self._VALID_URL, url) is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The RTMP path is base64-encoded in the page's JavaScript
        m = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(m.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        m = re.search(r'contentTitle = "(.*?)";', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = m.group(1)

        m = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = m.group(1) if m is not None else u'No description available.'

        # The id and extension both come from the URL's last path component
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2828
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        If the format carries per-bitrate URL lists, pick the requested
        bitrate (or the highest when 'best'/unknown); a TypeError means the
        format has no bitrate subdivision, so its flat URL list is returned.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # BUGFIX: regex groups over a text pattern are already text strings;
        # the old .decode('utf-8') calls raised AttributeError on Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # probe each format until one serves a live URL
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2936
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Extract a single video, a course playlist, or the whole site.

        Dispatches on which named URL groups matched:
          * course + video -> one video; metadata comes from a per-video XML file
          * course only    -> course page; recursively extracts every video page
          * neither        -> root page; recursively extracts every course page
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # title and relative video path both live in the XML document
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # fall back to the course id when the page has no <h1> title
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # every linked VideoPage becomes a reference that is re-extracted
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # every linked CoursePage becomes a reference that is re-extracted
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3040
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract the highest-quality rendition of an MTV video.

        Reads song name, performer and playlist ids from <meta> tags on the
        video page, then asks the mediaGen service for the rendition list.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        # BUGFIX: the webpage is already a text string, so the former
        # .decode('iso-8859-1') calls failed on Python 3 (str has no decode)
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.report_error('Invalid rendition field.')
            return

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }]
3116
3117
class YoukuIE(InfoExtractor):
    # Information extractor for v.youku.com; segmented downloads, one
    # info dict per flv/mp4 segment.
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Generate a pseudo-random session id: ms timestamp + two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Shuffle the fileid alphabet with Youku's seeded PRNG.

        Returns the alphabet as a list in the seed-determined order; the
        real file id is reassembled by indexing into this list.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # linear-congruential step; each draw selects and removes one char
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated obfuscated fileId via the seeded alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Extract all segments of a Youku video as separate info dicts."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # map the requested format onto Youku's stream names
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3219
3220
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Pull the flv URL, title and thumbnail out of the video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_download_webpage(video_id)

        # Get webpage content
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3275
3276
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading extry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report downloading extry"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report downloading extry"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report downloading extry"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the highest-resolution video embedded in a Google+ post."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            # BUGFIX: bail out instead of dereferencing the failed match below
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            # BUGFIX: bail out instead of indexing into an empty result below
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
3397
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL can be derived directly from the page path
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First (unescaped) group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUGFIX: this key was misspelled 'uploader_date', so the date
            # was silently dropped from the returned info dict
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3433
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (item_count, info_dicts).

        BUGFIX: error paths used to return None, which crashed the caller's
        tuple unpacking; they now return (0, []) so extraction stops
        gracefully after the error has been reported.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO-ish 'YYYY-MM-DD...'; keep the date digits
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # channel URLs are paged archives; /b/ URLs name a single broadcast
        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # a short page means we reached the end of the archive
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3516
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
            # Bug fix: bail out here; m.group() below would raise AttributeError
            return
        video_url = unescapeHTML(m.group('url'))

        # Prefer the player heading, fall back to the page <title>
        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                self._downloader.report_error(u'Cannot find video title')
                # Bug fix: bail out instead of crashing on m.group() below
                return
        title = clean_html(m.group('title'))

        # Description is optional
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3555
class SteamIE(InfoExtractor):
    """Information extractor for trailers on the Steam store."""
    _VALID_URL = r"""http://store.steampowered.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose style, so re.VERBOSE is required;
        # the inherited suitable() would not pass it.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Pass the age gate with a fixed date of birth
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # URLs, titles and thumbnails appear in the same order on the page
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                # Bug fix: skip this entry instead of appending an empty URL
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb,
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
3600
class UstreamIE(InfoExtractor):
    """Extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The FLV can be fetched straight from the CDN by id
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3622
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        # The media file URL appears verbatim in the page source
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # Derive the container from the URL itself
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # Fallback placeholder title (typo "Start" is in the original string)
            title = 'World Start Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            # Candy pages carry the real title in a different element
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
        return results
3678
class RBMARadioIE(InfoExtractor):
    """Extractor for Red Bull Music Academy Radio shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # The show metadata is embedded as a JSON blob in an inline script
        match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s stream from the CDN
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3713
3714
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the format dict whose 'format' equals req_format, else None."""
        for x in formats:
            if x['format'] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Pretend the age check has already been passed
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (optional)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html(video_uploader)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if len(links) == 0:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The 5th path component encodes "<size>_<bitrate>_<id>"
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join(format)
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # Formats are listed best-first on the page
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # Bug fix: the original tested the stale 'result' variable here,
            # so an unavailable format was silently returned as [None].
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3831
3832
3833
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Bug fix: the message wrongly said "video title" on this path
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3875
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Fetch the page and pull the title from it
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # The actual media URL lives on a separate embed page
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the file URL via addVariable()
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3921
class EightTracksIE(InfoExtractor):
    """Extractor for 8tracks.com mixes (each mix yields a list of songs)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata sits in an inline "PAGE.mix = {...};" assignment
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # The play API hands out one track per request within a random session
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
3965
class KeekIE(InfoExtractor):
    """Extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Media and thumbnail URLs are derived from the id alone
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
3989
class TEDIE(InfoExtractor):
    """Extractor for ted.com; handles both single talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose-mode comments, so re.VERBOSE is mandatory
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on whether the URL names a single talk or a playlist
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): the [.\s]*? classes below only match dots and
        # whitespace; they look intended as generic fillers — confirm they
        # still match the current page markup.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Each talk is delegated back to this extractor via url_result
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails JS object carries the numeric id and media slug
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
               }
        return info
4067
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de, driven by its XML metadata API."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Bug fix: previously fell back to the undefined name 'ext'
            # (NameError); use the extension derived from the URL instead.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4123
class SpiegelIE(InfoExtractor):
    """Extractor for spiegel.de videos, resolved via their flash XML feed."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # A per-video XML document lists the available encodings
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last entry in the document is the one we pick
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4156
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            self._downloader.report_error(u'Cannot find video title')
            # Bug fix: bail out here; m.group() below would raise AttributeError
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4205
class ARDIE(InfoExtractor):
    """Extractor for the ARD Mediathek (ardmediathek.de / daserste.de)."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    # Page title heading
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    # JS calls that register the available streams with the flash player
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # Prefer the numeric documentId query parameter when present
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # No registered streams: assumed to be an age-restricted ("fsk")
            # video that is only served in the evening
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4245
class TumblrIE(InfoExtractor):
    """Extractor for videos posted on tumblr.com blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?).tumblr.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        blog = url_match.group('blog_name')

        # Canonicalize to the /post/ URL before downloading
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is JS-escaped (\x22 quotes) inside the page
        re_video = r'src=\\x22(?P<video_url>http://%s.tumblr.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video_match = re.search(re_video, webpage)
        if video_match is None:
            self.to_screen("No video founded")
            return []
        video_url = video_match.group('video_url')
        ext = video_match.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
        thumbnail = re.search(re_thumb, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage).group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumbnail,
                 'ext': ext
                 }]
4279
4280
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE: keep the more specific extractors (playlists, channels, users,
    # searches) ahead of the plain ones they would otherwise shadow, and
    # GenericIE last — it matches almost any URL.
    return [
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVUserIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudSetIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        WorldStarHipHopIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        SpiegelIE(),
        LiveLeakIE(),
        ARDIE(),
        TumblrIE(),
        GenericIE()
    ]
4337
def get_info_extractor(ie_name):
    """Return the info extractor class named ``<ie_name>IE``.

    Raises KeyError if no extractor with that name exists in this module.
    """
    return globals()['%sIE' % ie_name]