# youtube_dl/InfoExtractors.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import absolute_import

import base64
import datetime
import email.utils
import itertools
import math
import netrc
import os
import random
import re
import socket
import sys
import time
import xml.etree.ElementTree

from .utils import *
21
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Whether _real_initialize() has already run for this instance.
    _ready = False
    # FileDownloader this IE reports to; set via set_downloader().
    _downloader = None
    # Set to False in subclasses that are known to be broken.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc), at most once."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: class name minus the trailing "IE" suffix.
        # Subclasses usually shadow this with a plain class attribute.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Returns the response handle.

        On network failure raises ExtractorError carrying the original
        traceback (requires the module-level ``import sys``).
        """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Returns the data of the page as a string.

        Bytes are decoded as UTF-8 with 'replace' so a bad byte never
        aborts extraction.
        """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        webpage_bytes = urlh.read()
        return webpage_bytes.decode('utf-8', 'replace')
129
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Handles watch pages, embeds, youtu.be short links and naked video IDs.
    Can log in (needed for some restricted videos), confirm age, download
    closed captions as .srt, and negotiate among the available formats.
    """

    # Verbose regexp: group 1 is the (optional) URL prefix, group 2 the video ID.
    _VALID_URL = r"""^
        (
            (?:https?://)?                                    # http(s):// (optional)
            (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
               tube\.majestyc\.net/)                          # the various hostnames, with wildcard subdomains
            (?:.*?\#/)?                                       # handle anchor (#/) redirect urls
            (?!view_play_list|my_playlists|artist|playlist)   # ignore playlist URLs
            (?:                                               # the various things that can precede the ID:
                (?:(?:v|embed|e)/)                            # v/ or embed/ or e/
                |(?:                                          # or the v= param in all its forms
                    (?:watch(?:_popup)?(?:\.php)?)?           # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                    (?:\?|\#!?)                               # the params delimiter ? or # or #!
                    (?:.*?&)?                                 # any other preceding param (like /?s=tuff&v=xxxx)
                    v=
                )
            )?                                                # optional -> youtube.com/xxxx is OK
        )?                                                    # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+)                                      # here is it! the YouTube video ID
        (?(1).+)?                                             # if we found the ID, everything can follow
        $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container/extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" (only used for display in --list-formats)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption length when no duration attr
            start = float(start)
            end = start + float(dur)
            # SubRip timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Download closed captions and return (error_message, srt_text).

        Exactly one element of the pair is non-None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name from the track list XML.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: explicit option > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'lang': srt_lang,
            'name': srt_lang_list[srt_lang].encode('utf-8'),
            'v': video_id,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print itag, extension and dimensions for each available format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language to English and optionally log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden anti-forgery tokens the login form requires.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, authentication failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url': '/',
                'action_confirm': 'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the 11-character video ID from *url*, or report an error."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # Group 2 of _VALID_URL is the bare video ID.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Extract the real video URL(s) and metadata; returns a list of dicts."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JS-escaped URL (\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' endpoint variants until one
        # returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: try each known textual date format in turn.
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # Non-matching formats raise ValueError; just try the next one.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): 'sig' is assumed present in every entry; a
            # signature-less entry would raise KeyError here — confirm.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
587
588
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter opt-out."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        # POST data must be bytes under Python 3.
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form).encode('ascii'))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract video URL, uploader and title from a Metacafe watch page.

        YouTube-hosted items (ids starting with "yt-") are delegated to the
        downloader's YouTube handling.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return
        # Decode once so every regex below operates on text; under Python 3
        # read() returns bytes and str patterns would raise TypeError.
        webpage = webpage_bytes.decode('utf-8', 'replace')

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Match groups are already text; no .decode() (str has no decode in py3).
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
714
715
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL and metadata from a Dailymotion watch page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The path component may carry a "_title" slug and a query string;
        # strip both to get the bare id.
        video_id = match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Probe qualities from best to worst and keep the first one present.
        max_quality = None
        for quality_key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(match.group('title'))

        # Uploader: prefer the regular owner span, fall back to the
        # official-user markup, otherwise warn and leave it unset.
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            # lookin for official user
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = official_match.group(1)

        # Upload date appears as DD-MM-YYYY; normalize to YYYYMMDD.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
802
803
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, uploader and title from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        # Decode once so the regexes below run against text; under Python 3
        # read() returns bytes and str patterns would raise TypeError.
        webpage = webpage_bytes.decode('utf-8', 'replace')

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Match groups are already text; no .decode() (str has no decode in py3).
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
867
868
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video metadata.

        Non-'/watch/' URLs are first rewritten to the canonical English
        '/watch/' form and re-extracted (new_video=False marks the retry).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # FIX: group(1) is the 'people'/'profile' path segment of the href;
        # the uploader's display name is captured by group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (both are required by the
        # playlist request below).
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]
1010
1011
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video metadata from the page's embedded config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize schemeless and direct-link URLs to a canonical page URL.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # FIX: previously a bare `except:` that also swallowed
            # KeyboardInterrupt/SystemExit. IndexError covers a missing
            # config marker in the page, ValueError covers malformed JSON.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (YYYYMMDD) from the dateCreated meta tag.
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality; codec preference follows `codecs`
        # order within each quality bucket.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1130
1131
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Pages whose last path component matches this are treated as live streams.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        # Download `url` and return the raw body; on failure the error is
        # reported via the downloader and None is returned (implicitly).
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        # Fetch `url`, run `regex` over the page, and collect capture groups
        # into a dict. `matchTuples` is a list of (group_index, key,
        # error_message) triples; returns None after reporting if the regex
        # does not match or any required group is empty.
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # Follow the live-stream JS to find the rtmp url/path pair.
        # NOTE(review): video_url is computed but never returned or stored,
        # so callers get no usable result for live streams — confirm whether
        # this is intentional (live streams unsupported) or an omission.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
            '(http://.*?\.swf).*?' +
            '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        # Resolve the chain of reference documents down to the final
        # <video> description XML, then build the info dict from it.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
            '<name>(.*?)</name>.*?' +
            '<dateVideo>(.*?)</dateVideo>.*?' +
            '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        # Dispatch on URL shape: live pages go through extractLiveStream
        # (which currently yields no result — see NOTE above), everything
        # else through the Plus7 on-demand path.
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1266
1267
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # No redirect happened: let the normal extraction continue.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # FIX: this branch previously reported 'unable to extract title',
            # copy-pasted from the title check above.
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
1412
1413
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        match = re.match(self._VALID_URL, query)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split off the "ytsearch<N>" prefix; N may be empty, 'all' or a count.
        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                if n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
            except ValueError:  # prefix is not an integer
                self._download_n_results(query, 1)
        return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        limit = n
        pagenum = 0

        # Page through the GData API, 50 results per request.
        while 50 * pagenum < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), 50 * pagenum + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            video_ids.extend(video['id'] for video in api_response['items'])

            # The API reports the real total; never page past it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        for video_id in video_ids[:n]:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1488
1489
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        match = re.match(self._VALID_URL, query)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split off the "gvsearch<N>" prefix; N may be empty, 'all' or a count.
        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                if n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
            except ValueError:  # prefix is not an integer
                self._download_n_results(query, 1)
        return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect video identifiers in page order, without duplicates.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = match.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    self._queue_downloads(video_ids)
                    return

            # No "next page" link: queue everything we found and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._queue_downloads(video_ids)
                return

            pagenum = pagenum + 1

    def _queue_downloads(self, video_ids):
        """Hand every collected video id to the downloader."""
        for video_id in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])
1570
1571
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        match = re.match(self._VALID_URL, query)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split off the "yvsearch<N>" prefix; N may be empty, 'all' or a count.
        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                if n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
            except ValueError:  # prefix is not an integer
                self._download_n_results(query, 1)
        return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect video identifiers in page order, skipping duplicates.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = match.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    self._queue_downloads(video_ids)
                    return

            # No "next page" link: queue everything we found and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._queue_downloads(video_ids)
                return

            pagenum = pagenum + 1

    def _queue_downloads(self, video_ids):
        """Hand every collected video id to the downloader."""
        for video_id in video_ids:
            self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])
1656
1657
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Group 1: prefix character selecting the playlist type ('p', 'a', 'list');
    # group 2: the playlist id; group 3: an optional trailing single-video id.
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    # Literal text present on a playlist page only when another page follows.
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: a trailing video id means the user wants just
        # that video, so hand it straight back to the downloader.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        # Page through the playlist until the "Next" marker disappears.
        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (de-duplicated within a page).
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Apply the user's --playlist-start/--playlist-end window.
        # playliststart is 1-based in params, hence the -1.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1736
1737
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect every video of the channel and queue each for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        # Walk the paginated video listing until the "next page" marker vanishes.
        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the video ids on this page, skipping duplicates.
            ids_in_page = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                vid = match.group(1)
                if vid not in ids_in_page:
                    ids_in_page.append(vid)
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1788
1789
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Queue all uploads of the given user for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries, so we
        # request successive windows until one of them comes back short.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # De-duplicated ids found in this response.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid not in ids_in_page:
                    ids_in_page.append(vid)

            video_ids.extend(ids_in_page)

            # A page that is not "full" must be the last one; no further
            # queries are necessary.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honour --playlist-start / --playlist-end.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1872
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Queue every video of the blip.tv user for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The episode-list API needs the numeric user id embedded in the
        # profile page. Previously a page without this attribute crashed with
        # AttributeError (the search result was used unchecked); report a
        # proper error instead.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id for %s' % username)
            return
        page_base = page_base % mobj.group(1)


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str (not str) keeps error text safe on Python 2,
                # consistent with every other handler in this file.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1963
1964
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct file URL and title from a depositfiles page."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Decode to text immediately: on Python 3 read() returns bytes
            # and the str regexps below would raise TypeError, as would the
            # later .decode() calls on already-text values.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # All values below are already text; no further decoding needed.
        file_title = mobj.group(1)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
2023
2024
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in with credentials from --username/--password or .netrc, if given."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available: anonymous access.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # Decode the response so the str regexp below works on Python 3,
            # where read() returns bytes (mixing str patterns with bytes
            # raises TypeError).
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # A login form in the response means the login was rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the HD video URL and metadata from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player variables are wedged between these two fixed JS snippets.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-encoded JSON carrying the media sources.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_url = params['hd_src']
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2116
2117
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video info via blip.tv's JSON API, or detect a direct file URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-API query with the appropriate separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                # basename is already text; the previous
                # title.decode('UTF-8') call raised AttributeError on
                # Python 3 (str has no decode()), breaking every direct
                # download.
                title,ext = os.path.splitext(basename)
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The interesting fields may be wrapped in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2207
2208
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the FLV URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: was `self._download.trouble`, a typo that raised
            # AttributeError instead of reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link carries the media base URL; the FLV lives next
        # to it as <base>/<video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
2257
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrate identifiers the feed may offer, with their container format and
    # frame size (used by --list-formats output below).
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the episode's MRSS show index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available format ids with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve an episode/clip URL to one info dict per video part."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shortname forms (:tds, :colbert, ...) redirect to the show's
        # full-episodes page, then re-match against _VALID_URL.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No explicit episode in the URL means "download the newest one".
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The full-episodes page redirects to the latest episode; pick the
            # episode name out of the final (redirected) URL.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the media id (data-mgid) without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Fetch the MRSS index listing each part (<item>) of the episode.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like mgid:...:<show>.com:<short id>.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Per-part configuration lists one <rendition> per bitrate.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL to the equivalent plain-HTTP mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2451
2452
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video URL and metadata for an Escapist episode."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        show_name = mobj.group('showname')
        video_id = mobj.group('episode')

        self.report_extraction(show_name)
        try:
            response = compat_urllib_request.urlopen(url)
            raw_page = response.read()
            # Honour the charset announced in the Content-Type header, if any.
            charset_match = re.match(r'text/html; charset="?([^"]+)"?', response.headers['Content-Type'])
            web_page = raw_page.decode(charset_match.group(1) if charset_match else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Pull the metadata out of the page's <meta> tags; the player URL
        # carries the config location in its query string.
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', web_page).group(1))
        img_url = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', web_page).group(1))
        player_url = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', web_page).group(1))
        config_url = compat_urllib_parse.unquote(re.search('config=(.*)$', player_url).group(1))

        self.report_config_download(show_name)
        try:
            config_response = compat_urllib_request.urlopen(config_url)
            charset_match = re.match(r'text/html; charset="?([^"]+)"?', config_response.headers['Content-Type'])
            config_json = config_response.read().decode(charset_match.group(1) if charset_match else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        config_json = config_json.replace("'", '"')

        try:
            config = json.loads(config_json)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        video_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'flv',
            'thumbnail': img_url,
            'description': description,
            'player_url': player_url,
        }]
2526
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked broken: extraction is known not to work against the live site.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a video id to an HDS fragment URL via the moogaloop metadata
        XML and the Adobe f4m manifest it points to."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Filled in incrementally from the two XML documents below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # Title/description/thumbnail and the manifest location come from the
        # moogaloop metadata document.
        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The f4m manifest (Adobe HDS) supplies the media node id and the real
        # video id used to assemble the fragment URL.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2597
2598
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Pull the flv URL, title and thumbnail out of the video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is percent-encoded inside the flash variables.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title lives in the <title> tag, before the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # The whole matched URL (group 0) is the thumbnail address.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2656
2657
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report id resolution via the API."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report stream retrieval."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve the track via the API, then fetch its stream URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract slug of song title (also from the url)
        slug_title = mobj.group(2)

        self.report_resolve('%s/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # The documented contract wants upload_date in YYYYMMDD form, while
        # the API returns a timestamp string (presumably like
        # u'2012/04/05 12:00:00 +0000' -- TODO confirm against live API).
        # Reformat when the leading date is recognizable, otherwise keep
        # the raw value as before.
        upload_date = info.get('created_at')
        if upload_date:
            mdate = re.match(r'(\d{4})[/-](\d{2})[/-](\d{2})', upload_date)
            if mdate:
                upload_date = ''.join(mdate.groups())

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
2730
2731
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Decode the base64 media id from the page and build the RTMP URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the real id is base64-encoded in jsclassref
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # Split on the LAST dot only: a filename with extra dots would make
        # a plain split('.') raise "too many values to unpack".
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2785
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        When the format entry has no bitrate sub-mapping, the entry itself
        is the url list (the TypeError path below).
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue

        return None

    def _print_formats(self, formats):
        """Print the available format/bitrate table to stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Fetch the cloudcast JSON and pick a working download URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE: regex groups are already text on Python 3 -- the old
        # .decode('utf-8') calls here crashed with AttributeError.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON; urlopen returns bytes, and json.loads only accepts
        # bytes on Python >= 3.6, so decode explicitly
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        if file_url is None:
            # previously this fell through and crashed on None below
            self._downloader.trouble(u'ERROR: unable to find a working download URL')
            return

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2900
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: single video, course playlist, or root page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search(r'<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search(r'<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # raw string: '\?' is an invalid escape in a plain string literal
            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # raw string: '\?' is an invalid escape in a plain string literal
            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3012
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Read meta tags from the page, then pick the best rendition from mediaGen."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        # webpage is already text; the former .decode('iso-8859-1') on the
        # match group raised AttributeError on Python 3
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            # find() returns None when <src> is missing, so .text raises
            # AttributeError -- catch that alongside the missing-attribute case
            video_url = rendition.find('./src').text
        except (KeyError, AttributeError):
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3092
3093
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a pseudo-random session id from the clock and two random ints."""
        now_ms = int(time.time() * 1000)
        rand_a = random.randint(1000, 1998)
        rand_b = random.randint(1000, 9999)
        return "%d%d%d" % (now_ms, rand_a, rand_b)

    def _get_file_ID_mix_string(self, seed):
        """Shuffle the alphabet deterministically from the server-provided seed."""
        pool = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        shuffled = []
        # Linear-congruential walk; every character is consumed exactly once.
        while pool:
            seed = (seed * 211 + 30031) % 65536
            idx = int(math.floor(seed / 65536 * len(pool)))
            shuffled.append(pool.pop(idx))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated index string into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(part)] for part in fileId.split('*') if part)

    def _real_extract(self, url):
        """Fetch the playlist JSON and emit one info dict per segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata.decode('utf-8'))
            entry = config['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            requested = self._downloader.params.get('format', None)
            supported_format = list(entry['streamfileids'].keys())

            if requested is None or requested == 'best':
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif requested == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][format]
            keys = [seg['k'] for seg in entry['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # columns 8,9 of fileid represent the segment number;
        # fileid[7:9] should be changed per segment
        files_info = []
        for index, key in enumerate(keys):
            segment_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, segment_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3203
3204
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape flv url, title and thumbnail with the class-level patterns."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            raw_page = compat_urllib_request.urlopen(url).read()
            webpage = raw_page.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
3267
3268
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract post metadata, then follow the photo page for the video links."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = r'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # BUG FIX: previously fell through and dereferenced None below
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # BUG FIX: previously fell through and indexed an empty list below
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
3392
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Derive the CDN mp4 URL from the page path and scrape page metadata."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Return the unescaped first group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUG FIX: this key was misspelled 'uploader_date', which is not a
            # recognized info field. NOTE(review): the scraped value is the
            # page's human-readable date, not YYYYMMDD -- may need reformatting.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3428
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
                     ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one page of the archive API and convert each clip with a
        non-empty video_file_url into an info dict.

        Returns (total_item_count, valid_infos).  On any error the problem is
        reported and (0, []) is returned so the caller's tuple unpacking and
        pagination loop stay well-defined."""
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            # BUG FIX: a bare `return` (None) crashed the caller, which
            # unpacks a (count, info) tuple from this method.
            return (0, [])

        response = json.loads(webpage)
        if not isinstance(response, list):
            # On error the API returns a JSON object with an 'error' key.
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO-like; keep YYYYMMDD digits only.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Extract a single broadcast, or page through a whole channel archive."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: the archive endpoint is paginated.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3515
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
            # BUG FIX: previously fell through and crashed on m.group('url').
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            # BUG FIX: previously fell through and crashed on m.group('title').
            return
        title = unescapeHTML(m.group('title'))

        # Description is optional; fall back to None when the og:description
        # meta tag is missing.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3552
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            # BUG FIX: previously fell through and crashed on m.group(1).
            return
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            # Non-fatal: keep going without a description.
            self._downloader.trouble(u'WARNING: Cannot find description')
            desc = None
        else:
            # Strip the embedded <a> tags before unescaping.
            desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            # BUG FIX: previously fell through and crashed on m.group(...).
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            return
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
3601
class SteamIE(InfoExtractor):
    """Information extractor for Steam store trailer pages."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose-mode whitespace, so re.VERBOSE is required.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return an info dict for every trailer listed on the game's video page."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        videos = []
        # Movie entries and title spans appear in the same order on the page,
        # so pair them positionally.
        for vid, vtitle in zip(mweb, titles):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            if not video_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
                # BUG FIX: skip this entry instead of appending an info dict
                # with an empty download URL.
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title)
            }
            videos.append(info)
        return videos
3637
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        # The download URL is derived directly from the recording ID.
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # Title and uploader are scraped from data attributes in the page.
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')

        return [{
            'id': video_id,
            'url': u'http://tcdn.ustream.tv/video/%s' % video_id,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }]
3659
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # All show metadata lives in an inlined `gon.show = {...}` JSON blob.
        json_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not json_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        # Extension is whatever follows the last dot in the URL path.
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3694
3695
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first format dict whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check; pre-set the cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (non-fatal)
        result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (non-fatal)
        result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'ERROR: unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 4th path component encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: previously tested the stale `result` variable (left
            # over from the page scraping above) instead of `format`, so an
            # unavailable requested format was never detected here.
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3812
3813
3814
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        # The title is part of the URL itself.
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: the message wrongly claimed the *title* could not be
            # extracted; it is the upload date that failed here.
            self._downloader.trouble(u'ERROR: unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3856
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Fetch the public page first; the stream itself lives on an embed page.
        webpage = self._download_webpage(url, mobj.group('videoid'))

        # The title is taken from the page's <title> element.
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # Locate the embed page, which carries the numeric video id.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the stream URL via a "file" variable.
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3902
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is inlined as the argument of the TRAX.Mix constructor.
        m = re.search(r"new TRAX.Mix\((.*?)\);\n*\s*TRAX.initSearchAutocomplete\('#search'\);", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(m.group(1))

        # The play API requires an arbitrary per-session identifier.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        tracks = []
        song_number = 0
        # Walk the play/next chain until the API reports the last track.
        while True:
            song_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(song_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            tracks.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return tracks
3946
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        # Video and thumbnail URLs are derived directly from the keek ID.
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # Scrape display metadata from the page.
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        uploader_match = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)

        return [{
            'id': video_id,
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': unescapeHTML(title_match.group('title')),
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': unescapeHTML(uploader_match.group('uploader'))
        }]
3970
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks."""
    _VALID_URL=r'http://www.ted.com/talks/(?P<videoName>\w+)'
    def _real_extract(self, url):
        m=re.match(self._VALID_URL, url)
        videoName=m.group('videoName')
        webpage=self._download_webpage(url, 0, 'Downloading \"%s\" page' % videoName)
        #If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>[\s\w:/\.\?=\+-\\\']*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails JS object carries the numeric id and the media
        # slug used to build the download URL (verbose regex: literal
        # spaces are escaped, layout whitespace is ignored).
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        # Direct MP4 download URL derived from the media slug.
        video_url='http://download.ted.com/talks/%s.mp4' % mediaSlug
        info = {
            'id':video_id,
            'url':video_url,
            'ext': 'mp4',
            'title': title
        }
        return [info]
3994
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: previously referenced the undefined name `ext`,
            # raising NameError whenever <format_id> was absent.
            format = extension
        else:
            format = format_id_el.text
        # Description and thumbnail are optional elements.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4050
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE: ordering is load-bearing.  The specialized Youtube extractors
    # (playlist/channel/user/search) must precede YoutubeIE so they get a
    # chance to match first, and GenericIE must stay last because it
    # accepts almost any URL and would shadow every extractor after it.
    return [
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVUserIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        TweetReelIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        GenericIE()
        ]
4102
4103