]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
Use extract_info in BlipTV User and Youtube Channel
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    An information extractor turns a URL into information about the
    video (or videos) that URL refers to: the real video URL, the
    title, the uploader, and so on. The result is a dictionary which is
    handed to the FileDownloader, which may then download the video to
    the file system, among other possible outcomes.

    Mandatory dictionary fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should redefine the _real_initialize() and
    _real_extract() methods, define a _VALID_URL regexp and, usually,
    be added to the list of extractors. _real_extract() must return a
    *list* of information dictionaries as described above.

    Broken IEs should set the _WORKING attribute to False so that users
    are warned and the tests are skipped.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc). Idempotent."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derived by stripping the trailing "IE" from the class name.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        return handle.read().decode('utf-8', 'replace')
131
132
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regexp: matches full watch/embed/short URLs as well as a naked
    # video ID. Group 1 is the (optional) URL prefix, group 2 the video ID.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension for the formats above.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> frame dimensions; note the order is "HEIGHTxWIDTH".
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL, so defer them to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SRT subtitle text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when none is given
            start = float(start)
            end = start + float(dur)
            # Format as SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Download closed captions for video_id.

        Returns a (warning_message, srt_contents) pair; exactly one of
        the two is None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map lang_code -> track name from the listing page.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: user-requested, then English, then first available.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'lang': srt_lang,
            'name': srt_lang_list[srt_lang].encode('utf-8'),
            'v': video_id,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the available itags with their extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the interface language, then (optionally) log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX/dsh tokens the Google login form requires.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
            u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form being present in the response means we were bounced
            # back to it, i.e. the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID for url, or report an error and return None."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # Group 2 of _VALID_URL is the video ID (group 1 is the URL prefix).
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JS-escaped URL (\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' values until one response carries a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # Best-effort: the page date may match any of the formats
                    # above; formats that fail to parse are simply skipped.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream entry also carries a 'sig'
            # parameter; an entry without one would raise KeyError -- TODO confirm
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # Build one result dict per (format, url) pair selected above.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
591
592
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page, then disable the filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a Metacafe watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Metacafe mirrors some YouTube videos; delegate those downloads.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            # BUGFIX: decode the page once. The original code applied str
            # regexes to the raw bytes and then called .decode('utf-8') on
            # the resulting values, which fails on Python 3 (str patterns
            # cannot match bytes, and str has no .decode()).
            webpage = webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # No &mediaURL= parameter; fall back to the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Already text after the decode above; no further .decode() needed.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
718
719
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        super(DailymotionIE, self).__init__(downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the highest-quality media URL, title, uploader and date."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Strip the slug/query suffix from the matched ID segment.
        video_id = m.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(req, video_id)

        self.report_extraction(video_id)
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Walk the quality keys from best to worst and keep the first present.
        max_quality = None
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Uploader: try the owner span first, then the official-user markup.
        video_uploader = None
        m = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if m is not None:
            video_uploader = m.group(1)
        else:
            m_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if m_official is not None:
                video_uploader = m_official.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # Upload date, converted from DD-MM-YYYY to YYYYMMDD.
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
807
808
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            # BUGFIX: decode the page once. The original code ran str regexes
            # over raw bytes and called .decode('utf-8') on str values, both
            # of which fail on Python 3.
            webpage = webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Already text after the decode above; no further .decode() needed.
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
872
873
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Yahoo! Video URL.

        Non-/watch/ URLs are first rewritten to the canonical /watch/ form
        and re-extracted once (new_video=False marks that recursion).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # BUGFIX: group(1) is the literal 'people'/'profile' path component;
        # the uploader's display name is captured by group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (the playlist endpoint needs them)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]
1015
1016
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Vimeo URL via the embedded config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize protocol-less and play_redirect_hls URLs to a page URL.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page.
        # FIX: the bare `except:` here also swallowed KeyboardInterrupt and
        # SystemExit; only the two failures these statements actually raise
        # (marker missing -> IndexError, malformed JSON -> ValueError) are
        # caught now.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date (YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality bucket; the for/else fires only
        # when no bucket had any entry.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1135
1136
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        # Download *url* and return its body; on failure reports through the
        # downloader and falls off the end (implicitly returning None).
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        # Fetch *url*, apply *regex* with *regexFlags*, and collect the capture
        # groups listed in *matchTuples* -- tuples (group_index, key, error_msg)
        # -- into a dict.  Returns None (after reporting the tuple's error)
        # when the page does not match or a required group is empty.
        # NOTE(review): if fetch_webpage failed, *page* is None here and
        # re.search raises TypeError -- confirm whether that path is reachable.
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # Locate the RTMP stream of a live page.  The language code ('fr'/'de')
        # sits four path components from the end of the URL.
        video_lang = url.split('/')[-4]
        # Step 1: find the videothek JS file referenced by the page.
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Step 2: pull stream path, SWF player, and rtmp URL out of that JS.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): video_url is computed but never returned or stored, so
        # callers of extractLiveStream get None -- the live path produces no
        # downloadable result.  Looks like a missing `return`; confirm intent.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        # Resolve a "+7" catch-up video by following the chain:
        # page -> videoref XML -> per-language <video> XML -> HD stream URL.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages end in index-N.html; everything else is a +7 video.
        # NOTE(review): the live branch returns None to the caller because
        # extractLiveStream discards its result (see note above).
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1271
1272
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect (e.g. URL shortener) is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener; handler order matters for redirect/405 handling.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Try progressively broader patterns; flattened from the original
        # nested-if pyramid (each search only runs if the previous missed).
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # BUGFIX: this branch previously reported "unable to extract title".
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
1420
1421
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearchN:/ytsearchall: prefix and dispatch the search."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            # Bare 'ytsearch:' means a single result.
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError:  # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The GData API serves 50 results per page.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            # Idiom fix: 'x not in y' instead of 'not x in y'.
            if 'items' not in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                return

            # Idiom fix: list comprehension instead of list(genexp).
            new_ids = [video['id'] for video in api_response['items']]
            video_ids += new_ids

            # Never page past the total number of hits the API reports.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1500
1501
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearchN:/gvsearchall: prefix and run the search."""
        match = re.match(self._VALID_URL, query)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        # Bare 'gvsearch:' downloads a single result.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        # 'all' requests the service maximum.
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        # Otherwise the prefix must be a result count.
        try:
            count = int(prefix)
        except ValueError:  # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return
        if count <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (count, query))
            return
        if count > self._max_google_results:
            self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, count))
            count = self._max_google_results
        self._download_n_results(query, count)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Harvest new ids from this results page, stopping as soon as the
            # requested count is reached.
            for id_match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = id_match.group(1)
                if candidate in video_ids:
                    continue
                video_ids.append(candidate)
                if len(video_ids) == n:
                    for vid in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                    return

            # No "next" link means the result set is exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            pagenum = pagenum + 1
1582
1583
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearchN:/yvsearchall: prefix and run the search."""
        match = re.match(self._VALID_URL, query)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        # Bare 'yvsearch:' downloads a single result.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        # 'all' requests the service maximum.
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        # Otherwise the prefix must be a result count.
        try:
            count = int(prefix)
        except ValueError:  # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return
        if count <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (count, query))
            return
        if count > self._max_yahoo_results:
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, count))
            count = self._max_yahoo_results
        self._download_n_results(query, count)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Harvest ids from this page, de-duplicating across pages.
            for id_match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = id_match.group(1)
                if vid in already_seen:
                    continue
                video_ids.append(vid)
                already_seen.add(vid)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for queued in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % queued])
                    return

            # No "next" link means the result set is exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for queued in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % queued])
                return

            pagenum = pagenum + 1
1668
1669
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                     (?:https?://)?
                     (?:\w+\.)?
                     youtube\.com/
                     (?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                     |  user/.*?/user/
                     |  p/
                     |  user/.*?\#[pg]/c/
                     )
                     ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                     .*
                  |
                     ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                  )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Fetch all playlist entries via the GData API, apply the user's
        playliststart/playlistend window, and hand the video URLs to the
        downloader."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download playlist videos from API; group(1)/group(2) correspond to
        # the two alternatives in _VALID_URL.
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))
                return

            # Idiom fix: 'x not in y' instead of 'not x in y'.
            if 'feed' not in response or 'entry' not in response['feed']:
                self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API')
                return
            # Keep (position, url) pairs so the final list can be sorted into
            # playlist order; entries without 'content' (deleted videos) are
            # skipped.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means we've reached the end of the playlist.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]
        total = len(videos)

        # playliststart is 1-based in user options; playlistend == -1 means
        # "to the end".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        return self._downloader.extract_info_iterable(videos)
1760
1761
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Pull the channel id out of the URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)

        # Walk the paginated listing until the "next page" marker
        # disappears, accumulating the video ids found on each page.
        video_ids = []
        pagenum = 1
        while True:
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            req = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(req).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's video ids, skipping within-page duplicates
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % vid for vid in video_ids]
        return self._downloader.extract_info_iterable(urls)
1811
1812
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Determine which user's uploads we are fetching
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries,
        # so request consecutive windows until a short page signals the
        # end of the upload list.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # De-duplicate the ids found on this page before appending
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)

            video_ids.extend(page_ids)

            # A page with fewer than _GDATA_PAGE_SIZE ids must be the
            # last one, so there is no need to query again.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply the user's --playlist-start / --playlist-end window
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        return self._downloader.extract_info_iterable(urls)
1894
1895
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the user's numeric id from their page, then pages through
    the mobile AJAX episode list to collect video page paths.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id needed for the AJAX calls is embedded in
        # the user page markup. Fixed: a missing match used to raise an
        # uncaught AttributeError on mobj.group(1).
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id from webpage')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request(url)
            try:
                # Fixed: use compat_str(err) like the rest of the file,
                # not str(err)
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (the path component of each
            # video page URL). Fixed: de-duplicate on the unescaped
            # value, which is what actually gets stored.
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_path = unescapeHTML(mobj.group(1))
                if video_path not in ids_in_page:
                    ids_in_page.append(video_path)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply the user's --playlist-start / --playlist-end window
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        return self._downloader.extract_info_iterable(urls)
1985
1986
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Fixed: decode the response immediately so the str-pattern
            # regexes below work on text (read() returns bytes under
            # Python 3, which would raise TypeError).
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction notice
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Fixed: these values are already text; the old code called
        # .decode('utf-8') on them, which fails on Python 3 str objects.
        file_title = mobj.group(1)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
2045
2046
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook if credentials were supplied."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available: proceed anonymously
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # Fixed: decode the response so the str-pattern regex below
            # works on text — urlopen().read() returns bytes under
            # Python 3, which would raise TypeError.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the login failed
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are emitted between two fixed pieces of
        # JavaScript; grab the JSON array in between.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream, fall back to SD
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2142
2143
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Handles three URL shapes: /play/ redirect pages (resolved to a
    canonical URL and re-extracted), URLs whose JSON endpoint serves the
    video file directly, and regular URLs whose metadata comes back as
    JSON.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a page whose fragment carries the real
        # file reference; resolve it and recurse on the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Ask the site for JSON metadata by appending the skin=json query
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data depending on the User-Agent;
        # impersonating iTunes yields the JSON this extractor expects
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The endpoint returned the video itself, not metadata:
                # synthesize the info dict from the URL and hand the open
                # response to the downloader via 'urlhandle'.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): .decode('UTF-8') assumes a bytes title —
                # Python 2 only; on Python 3 this path would fail. Confirm
                # intended interpreter before changing.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            # urlh is still open from the try block above; read the JSON
            # metadata from it now.
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                # Derive the container extension from the media URL
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                # Covers malformed dates, missing JSON keys and the
                # extension ValueError raised above
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2244
2245
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: this called self._download.trouble (nonexistent
            # attribute), which raised AttributeError instead of
            # reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media base URL; the flv lives
        # next to it under the video id
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
2294
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                           (?P<clip>
                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                      $"""

    # Known bitrates, lowest-quality last
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Bitrate -> container extension
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Bitrate -> video resolution (for --list-formats display)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose-mode pattern
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the show's MRSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available bitrates/resolutions for --list-formats."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Resolve :tds / :colbert style shorthands to the show's
        # full-episodes page, then re-match to populate the named groups
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Work out the episode/clip title; an empty 'episode' group means
        # we should follow the redirect to the newest episode
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The server redirected us to the newest episode; pick the
            # episode title out of the URL we actually landed on
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Find the mtvnservices media URI(s) embedded in the page
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Fetch the MRSS index listing the episode's parts
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        # One <item> per part of the episode
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Fetch the per-part config XML listing the renditions
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp url) pairs for each rendition
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into a plain-HTTP mirror URL
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2489
2490
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        show_name = mobj.group('showname')
        video_id = mobj.group('episode')

        self.report_extraction(show_name)
        try:
            handle = compat_urllib_request.urlopen(url)
            raw_page = handle.read()
            # Honour the charset announced in the Content-Type header,
            # defaulting to UTF-8 when none is given
            charset_match = re.match(r'text/html; charset="?([^"]+)"?', handle.headers['Content-Type'])
            page = raw_page.decode(charset_match.group(1) if charset_match else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Pull description, thumbnail and player URL out of the page's
        # meta tags, then recover the config URL from the player URL
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', page).group(1))
        thumb_url = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        player_url = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        config_url = compat_urllib_parse.unquote(re.search('config=(.*)$', player_url).group(1))

        self.report_config_download(show_name)
        try:
            config_handle = compat_urllib_request.urlopen(config_url)
            charset_match = re.match(r'text/html; charset="?([^"]+)"?', config_handle.headers['Content-Type'])
            config_json = config_handle.read().decode(charset_match.group(1) if charset_match else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        config_json = config_json.replace("'", '"')

        try:
            config = json.loads(config_json)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The second playlist entry carries the actual video URL
        video_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'flv',
            'thumbnail': thumb_url,
            'description': description,
            'player_url': player_url,
        }]
2564
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com

    Fetches the moogaloop metadata XML for a video, then downloads the
    Adobe f4m manifest it points at and builds the final HDS fragment URL.
    """

    # Disabled: the site/API this extractor targets no longer works this way.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report that the XML manifest is being downloaded."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Partially filled here; title/description/url are added below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # Step 1: per-video metadata XML (title, description, manifest URL).
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # Step 2: the f4m manifest; the query parameter selects the HDS
        # core version (presumably required by the server — unverified).
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # Elements live in the Adobe f4m namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the fragment URL on the manifest's host; the last two
        # characters of the manifest id are dropped (f4m URL convention
        # as implemented here — TODO confirm against a live manifest).
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2635
2636
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Announce that information is being extracted."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The flash player URL is percent-encoded in the page flashvars.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title is the page <title> minus the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail: keep the whole matched URL (group 0), not just the name.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2694
2695
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

    Resolves the track page through the public SoundCloud API, then asks
    the stream-definition endpoint for the available streams and picks
    the 128 kbit/s MP3 HTTP stream.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Announce that the track id is being resolved."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that the stream is being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The URL itself carries the uploader and the slug of the title.
        uploader, slug_title = match.group(1), match.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            resolve_request = compat_urllib_request.Request(resolv_url)
            info_json = compat_urllib_request.urlopen(resolve_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            streams_request = compat_urllib_request.Request(streams_url)
            stream_json = compat_urllib_request.urlopen(streams_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id': info['id'],
            'url': streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
2768
2769
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Announce that information is being extracted."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real id is base64-encoded in the page's jsclassref attribute.
        m = re.search(r"jsclassref='([^']*)'", webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(m.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        m = re.search(r'contentTitle = "(.*?)";', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = m.group(1)

        # Description is optional; fall back to a placeholder.
        m = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = m.group(1) if m is not None else u'No description available.'

        # The last path component encodes both the id and the extension.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension,  # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2823
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com

    Downloads the cloudcast JSON and picks a download URL from its
    'audio_formats' section, honouring the user's --format request.
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Returns the URL list for the requested bitrate; if the requested
        bitrate is missing (or 'best'/None), the highest one is used.
        A TypeError from subscripting means the format has no per-bitrate
        sub-dict, in which case the format entry itself is the URL list.
        """
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probes each URL with a real request; the first that opens wins.
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        # Human-readable listing for --list-formats; mirrors the
        # bitrate/no-bitrate duality handled in get_urls().
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE(review): .decode() on a str fails on Python 3; the extractor
        # is disabled via _WORKING = False, which presumably covers this.
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Try each format until one yields a URL that actually opens.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
2938
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a single video page, a course page
    (expanded into its video pages), and the site root (expanded into
    all course pages). Playlist branches recurse through self.extract().
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Each video has a sibling XML file with its title and file name.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            # Extension is whatever follows the last dot of the file name.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Title/description are optional; fall back to the course id.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect all video-page links, preserving order, no duplicates.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse: each reference hits the single-video branch above.
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect all course-page links; each recurses into the
            # course branch above via self.extract().
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3050
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    Reads song/performer metadata from the page's <meta> tags, then asks
    the mediaGen service for the available renditions and picks the last
    (highest-quality) one.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        # The webpage is already decoded text; the former
        # .decode('iso-8859-1') calls raised AttributeError on Python 3.
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing the renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            # Previously crashed with IndexError on an empty list.
            self._downloader.trouble(u'ERROR: unable to find any renditions')
            return

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3130
3131
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves each video as numbered segments; the segment file ids
    are obfuscated with a seeded permutation reproduced below, so the
    exact arithmetic and iteration order must not be changed.
    """
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        # Session id: millisecond timestamp followed by two random parts.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Reproduce Youku's seeded shuffle of the alphabet: a simple
        # linear-congruential sequence drives a draw-without-replacement
        # from `source`, yielding the decoding table for file ids.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # The '*'-separated numbers in fileId index into the mix string.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # getPlayList returns JSON describing title, seed and segments.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the user's --format request onto Youku's stream names.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the segment number (hex) into positions 8-9 of the id.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3241
3242
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Announce the webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that information is being extracted."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page content directly.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # The flash video URL is percent-encoded in the flashvars.
        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3305
3306
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com.

    Downloads the post page, extracts metadata, then follows the photo
    page it embeds and picks the highest-resolution video link.
    """

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's upload date."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # Bug fix: previously fell through and crashed on mobj.group(1).
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # Bug fix: previously fell through and crashed indexing an empty list.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
3430
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages.

    The MP4 URL is derived directly from the path component of the page
    URL; title, date and description are scraped from the page's markup.
    """
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Search the page for a property, falling back to `default`.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Bug fix: the key was misspelled 'uploader_date', so the date
            # never reached the downloader under the documented field name.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3466
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url):
        """Download one API page; return (count of items, list of valid items).

        On any error this returns (0, []) so the caller's tuple unpacking
        and pagination logic keep working.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            # Fixed: previously returned None, which made the caller's
            # `page_count, page_info = self._parse_page(...)` raise TypeError.
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            # The API signals errors with a dict instead of a list
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time looks like YYYY-MM-DD...; strip dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        # Group 1 only -> channel URL (paged archive); group 2 -> single broadcast
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3553
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Fixed: trouble() did not necessarily abort, so execution fell
            # through to m.group() on None (AttributeError). Raise instead.
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            # Same fix as above: abort explicitly when the title is missing
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        # Description is optional
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3590
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so re.VERBOSE is required
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        # Movies, titles and thumbnails appear in the same order on the page,
        # so the three iterators are zipped together below.
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
                # Fixed: previously the entry was appended with an empty URL
                # anyway; skip broken entries instead.
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return videos
3631
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        # The numeric recording id is everything after /recorded/
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The FLV is served from a fixed CDN path keyed by the id
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3653
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is embedded as a JSON blob in an inline script
        metadata_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not metadata_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(metadata_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream from the Akamai URL
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3688
3689
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry in `formats` whose 'format' matches req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The age check is bypassed with a cookie
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional; warn rather than abort)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (optional; warn rather than abort)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 encodes "<size>_<bitrate>_<id>"
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Fixed: previously tested the stale `result` variable left over
            # from the download-list search, so a missing requested format was
            # never reported and [None] could be returned.
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3806
3807
3808
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        # The title is taken from the URL itself, not the page
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Fixed: this error message wrongly said "unable to extract
            # video title" although it is the upload date that failed.
            self._downloader.trouble(u'ERROR: unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3850
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = match.group('videoid')

        # Fetch the main video page
        webpage = self._download_webpage(url, video_id)

        # The title comes from the <title> tag
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # Locate the embed player page, which holds the real stream URL
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the file URL via so.addVariable(...)
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3896
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (song playlists)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as JSON assigned to PAGE.mix
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(m.group(1))

        # A random session id is required by the play API
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        tracks = []
        index = 0
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(index+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            tracks.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            # Each response names the id the "next" endpoint needs
            next_url = ('http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s'
                        % (session, mix_id, track_data['id']))
            index += 1
        return tracks
3940
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Both the stream and the thumbnail live on fixed CDN paths
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        uploader_match = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': unescapeHTML(title_match.group('title')),
            'thumbnail': thumbnail,
            'uploader': unescapeHTML(uploader_match.group('uploader')),
        }]
3964
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Verbose-mode pattern; either a playlist URL or a single-talk URL
    _VALID_URL=r'''http://www.ted.com/
                (
                ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                |
                ((?P<type_talk>talks)) # We have a simple talk
                )
                /(?P<name>\w+) # Here goes the name and then ".html"
                '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is verbose, so re.VERBOSE is required for matching
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on which alternative of _VALID_URL matched
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Each <li id="talk_..."> entry carries the video id and media slug
        video_RE=r'''
                 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                 ([.\s]*?)data-playlist_item_id="(\d+)"
                 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                 '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # Entries and their titles appear in page order, so zip pairs them up
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails JS object carries the real video id and media slug
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4037
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos (metadata via XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Fixed: was `format = ext`, but `ext` is undefined here
            # (NameError); fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        # Description and thumbnail are optional
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4093
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # More specific extractors (playlists, channels, users) must come before
    # the more general ones they overlap with; GenericIE must stay last
    # because it matches almost anything.
    return [
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVUserIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        GenericIE()
    ]
4144
4145