]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
Py2/3 parse_qs compatibility
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import netrc
6 import os
7 import re
8 import socket
9 import time
10 import email.utils
11 import xml.etree.ElementTree
12 import random
13 import math
14
15 from utils import *
16
17
class InfoExtractor(object):
    """Information Extractor base class.

    An information extractor takes a URL and produces a *list* of
    dictionaries describing the video(s) it refers to.  Each dictionary is
    handed to the FileDownloader, which may then download the video, print
    metadata, etc.

    Mandatory fields in every result dictionary:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses must define a _VALID_URL regexp and override
    _real_initialize() and _real_extract(); _real_extract() returns the
    list of dictionaries described above.  Broken extractors should set
    _WORKING to False so users are warned and tests are skipped.
    """

    # Class-level defaults; instances shadow _ready/_downloader in __init__.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor and attach the optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this extractor can handle the given URL."""
        return bool(re.match(self._VALID_URL, url))

    def working(self):
        """Return whether this extractor is known to be functional."""
        return self._WORKING

    def initialize(self):
        """Run one-time setup (authentication, etc.) on first use."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then extract and return the info dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the FileDownloader instance used for reporting/downloading."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
100
101
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose (re.VERBOSE) pattern; group 1 matches the URL prefix (if any),
    # group 2 captures the video ID.  A naked ID with no scheme/host also
    # matches because everything before the ID is optional.
    _VALID_URL = r"""^
                 (
                     (?:https?://)?                                       # http(s):// (optional)
                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                             v=
                         )
                     )?                                                   # optional -> youtube.com/xxxx is OK
                 )?                                                       # all until now is optional -> you can pass the naked ID
                 ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                 (?(1).+)?                                                # if we found the ID, everything can follow
                 $"""
    # Endpoints used for language selection, login and age verification.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> filename extension; anything not listed falls back to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" display string used by --list-formats.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Must override the base class to pass re.VERBOSE for _VALID_URL.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            # Captions without an explicit duration are shown for 4 seconds.
            if not dur: dur = '4'
            start = float(start)
            end = start + float(dur)
            # SRT timestamp format: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print a table of itag / extension / dimensions for --list-formats."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age on youtube.com."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language (relies on the session cookie jar installed by the
        # downloader; the response body itself is discarded).
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the login form is still present in the response, the
            # credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download the watch page and get_video_info, pick formats and
        return the list of info dictionaries for this video."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Attempt to extract SWF player URL (backslash-escaped in the page
        # source, hence the re.sub to unescape it below).
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one returns a
        # response containing a 'token' parameter.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage = compat_urllib_request.urlopen(request).read()
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
        # NOTE(review): .decode here assumes the parsed value is bytes
        # (Python 2); on Python 3 str has no .decode — confirm target version.
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the watch page, normalized to YYYYMMDD
        # by trying several textual date formats.
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): bare except also swallows KeyboardInterrupt;
                    # ValueError would be sufficient here.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions: list available tracks, pick a language, download
        # the timedtext XML and convert it to SRT.  Trouble is raised for
        # soft failures and reported without aborting extraction.
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = compat_urllib_request.urlopen(request).read()
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    # NOTE(review): dict.keys() is not subscriptable on
                    # Python 3 (needs list(...)[0]) — confirm target version.
                    srt_lang = srt_lang_list.keys()[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = compat_urllib_request.urlopen(request).read()
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # NOTE(review): indexing an exception (trouble[0]) is
                # Python 2-only; Python 3 requires trouble.args[0].
                self._downloader.trouble(trouble[0])

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # RTMP stream: single URL, no itag.
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Each comma-separated entry is itself a urlencoded dict with
            # itag/url/sig; build an itag -> signed URL map from them.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # One result dictionary per selected format.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
                                            self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id.decode('utf-8'),
                'url':      video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension.decode('utf-8'),
                'format':   video_format,
                'thumbnail':    video_thumbnail.decode('utf-8'),
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
501
502
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1 of _VALID_URL is the video id, group 2 the simplified title.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter form so the
        session can access age-restricted content."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a metacafe page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate the whole
        # download to the downloader (which will pick YoutubeIE).
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Old-style page: direct mediaURL parameter.
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Newer-style page: the URL and key live inside the urlencoded
            # "flashvars" value under the mediaData JSON-ish blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # NOTE(review): .decode assumes bytes (Python 2 semantics) — confirm
        # target version; on Python 3 regex groups of a str have no .decode.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
628
629
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Group 1 captures the path component after /video/; the id proper is
    # the part before the first '_' or '?'.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the highest-quality video URL, title, uploader and upload
        date from a Dailymotion page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information; the cookie
        # disables the family filter so restricted videos are reachable.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality present, in descending preference order.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                # NOTE(review): only a WARNING here, but video_uploader stays
                # None and .decode('utf-8') below would then raise — confirm.
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
725
726
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # Matches videoplay URLs on the various national Google Video domains;
    # group 1 is the docid.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and (optionally) thumbnail from a
        Google Video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # No direct mp4 download link; fall back to the \x-escaped flv
            # videoUrl embedded in the page.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        # Undo the \xNN escaping used in the page source ('=' and '&').
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        # NOTE(review): video_description and video_thumbnail are extracted
        # above but never placed in the result dict — looks unintentional;
        # confirm before adding 'description'/'thumbnail' keys.
        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
818
819
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # The flv filename in the 'current=' query parameter doubles as the video id.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a photobucket page."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = m.group(1)
        video_extension = 'flv'

        # Fetch the page; every piece of metadata is scraped from its markup.
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(compat_urllib_request.Request(url)).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The media URL is carried by the rel="video_src" <link> element.
        m = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(m.group(1))

        # Title and uploader both live in the <title> element.
        m = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = m.group(1).decode('utf-8')
        video_uploader = m.group(2).decode('utf-8')

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        }]
883
884
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a video.yahoo.com URL.

        Non-'/watch/' URLs are first rewritten to the canonical English
        '/watch/' form and re-extracted with new_video=False so the
        rewrite happens at most once.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # BUGFIX: group(1) captures the 'people'/'profile' path segment;
        # the uploader's display name is captured by group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed for the playlist request below)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]
1025
1026
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from the embedded config JSON of a
        vimeo.com page, then pick the best quality/codec combination."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON.
        # BUGFIX: the split/index used to run outside the try, so a page
        # without a config section crashed with IndexError; the bare
        # 'except:' also swallowed SystemExit/KeyboardInterrupt.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date (left None when the page omits it)
        video_upload_date = None
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first available quality bucket, preferring hd > sd > other.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1138
1139
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live streams are recognized by an 'index-NNN.html' final path component.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return the page body as a byte string.

        On failure this reports via trouble() and returns None.
        NOTE(review): a None return is fed straight into re.search() by
        grep_webpage(), which would raise TypeError — confirm intended.
        """
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # urlopen raises ValueError for malformed URL strings.
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex and collect named groups into a dict.

        matchTuples is a list of (group_index, key, error_message)
        triples; each matched group's text is stored under key.
        Returns the dict, or None (after reporting the corresponding
        error) when the page does not match or a listed group is empty.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS indirection chain for url.

        NOTE(review): the final video_url is computed but neither
        returned nor stored, so live extraction currently produces no
        downloadable result — looks unfinished; confirm before relying
        on this path.
        """
        # Language code ('fr'/'de') is the 4th-from-last path component.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
            '(http://.*?\.swf).*?' +
            '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an arte.tv '+7' catch-up page to its mp4 info dict.

        Follows two levels of indirection: the player's videorefFileUrl,
        then the language-specific <video ref="..."> document, and
        finally reads id/name/date/hd-url from the video XML.
        """
        # Language code is the 3rd-from-last path component here.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
            '<name>(.*?)</name>.*?' +
            '<dateVideo>(.*?)</dateVideo>.*?' +
            '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        # The trailing path component doubles as the reported video id.
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            # NOTE(review): returns None (not a list) for live URLs, so
            # they yield no info entries; confirm this is intentional.
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1275
1276
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Probe with HEAD so we don't download the body just to read the
            # final URL.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # No redirect happened: let normal extraction proceed.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Best-effort extraction: scrape a direct media URL from the page."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # BUGFIX: this failure path used to report the copy-pasted
            # message 'unable to extract title'.
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        }]
1421
1422
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'ytsearch[N|all]:terms' query and queue the results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'ytsearch'
        query = query.encode('utf-8')

        if prefix == '':
            return self._download_n_results(query, 1)
        if prefix == 'all':
            return self._download_n_results(query, self._max_youtube_results)
        try:
            n = int(prefix)
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            return self._download_n_results(query, n)
        except ValueError:  # parsing prefix as integer fails
            return self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        video_ids = []
        pagenum = 0
        limit = n

        # The API serves 50 results per page; 'limit' shrinks to the real
        # total once the first response reports it.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            try:
                data = compat_urllib_request.urlopen(compat_urllib_request.Request(result_url)).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            video_ids.extend([video['id'] for video in api_response['items']])

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # Trim any overshoot from the last page and queue the downloads.
        video_ids = video_ids[:n]
        for vid in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
1497
1498
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'gvsearch[N|all]:terms' query and queue the results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'gvsearch'
        query = query.encode('utf-8')

        if prefix == '':
            return self._download_n_results(query, 1)
        if prefix == 'all':
            return self._download_n_results(query, self._max_google_results)
        try:
            n = int(prefix)
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            return self._download_n_results(query, n)
        except ValueError:  # parsing prefix as integer fails
            return self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(result_url)).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect unique identifiers from this results page.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                vid = mobj.group(1)
                if vid in video_ids:
                    continue
                video_ids.append(vid)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for vid in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                    return

            # No "next page" link: queue whatever was found and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            pagenum = pagenum + 1
1579
1580
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'yvsearch[N|all]:terms' query and queue the results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'yvsearch'
        query = query.encode('utf-8')

        if prefix == '':
            return self._download_n_results(query, 1)
        if prefix == 'all':
            return self._download_n_results(query, self._max_yahoo_results)
        try:
            n = int(prefix)
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            return self._download_n_results(query, n)
        except ValueError:  # parsing prefix as integer fails
            return self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(result_url)).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect unseen identifiers from this results page.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                vid = mobj.group(1)
                if vid in already_seen:
                    continue
                already_seen.add(vid)
                video_ids.append(vid)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for vid in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                    return

            # No "Next" link: queue whatever was found and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            pagenum = pagenum + 1
1663
1664
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Walk every page of the playlist and queue each video URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        video_ids = []
        pagenum = 1
        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(url)).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Deduplicate identifiers within this page before appending.
            ids_on_page = []
            for match in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                vid = match.group(1)
                if vid not in ids_on_page:
                    ids_on_page.append(vid)
            video_ids.extend(ids_on_page)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            pagenum = pagenum + 1

        # Apply the user's --playlist-start / --playlist-end window.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        for vid in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
1736
1737
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Queue every video found on the channel's paginated video listing."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = match.group(1)
        collected = []
        page_no = 1

        while True:
            self.report_download_page(channel_id, page_no)
            page_url = self._TEMPLATE_URL % (channel_id, page_no)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(page_url)).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the video ids on this page, dropping within-page duplicates.
            page_ids = []
            for hit in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if hit.group(1) not in page_ids:
                    page_ids.append(hit.group(1))
            collected.extend(page_ids)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            page_no += 1

        for video_id in collected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1786
1787
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Page through the user's GData uploads feed and queue every video id."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = match.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries, so
        # request successive windows until a short (non-full) page arrives.
        collected = []
        page_index = 0

        while True:
            first = page_index * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, first)

            feed_request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, first))

            try:
                feed = compat_urllib_request.urlopen(feed_request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect ids on this page, dropping within-page duplicates.
            page_ids = []
            for hit in re.finditer(self._VIDEO_INDICATOR, feed):
                if hit.group(1) not in page_ids:
                    page_ids.append(hit.group(1))
            collected.extend(page_ids)

            # A page that is not full must be the last one - no need to
            # query again.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            page_index += 1

        total = len(collected)
        start = self._downloader.params.get('playliststart', 1) - 1
        end = self._downloader.params.get('playlistend', -1)

        if end == -1:
            collected = collected[start:]
        else:
            collected = collected[start:end]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, total, len(collected)))

        for video_id in collected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1870
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Resolve the user's numeric id from the profile page, then page
        through the Ajax episode list and queue every collected video URL."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Fix: the search result was previously dereferenced inside the try
        # above, so a missing match raised an uncaught AttributeError instead
        # of producing an error message.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id from webpage')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str(err) for consistency with the rest of the file
                # (previously used plain str here).
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1961
1962
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Request the file page with the 'Free download' form submitted and
        extract the direct download URL and title from the response."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Fix: raw string for the pattern - '\s' in a plain literal is
                # an invalid escape sequence on modern Python.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
        }]
2025
2026
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in using --username/--password or .netrc credentials, if any."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download the video page, parse out metadata and format URLs, and
        return one info dict per selected format."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                # Fix: narrowed from a bare except, which also swallowed
                # KeyboardInterrupt/SystemExit. strftime may still fail on
                # out-of-range tuple fields; that stays best-effort.
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except Exception:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format
        else:
            # Fix: with an empty url_map, video_url_list was never assigned
            # and the loop below crashed with NameError.
            self._downloader.trouble(u'ERROR: no known formats available for video')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            })
        return results
2232
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Fetch the page's JSON metadata variant; if the server instead
        responds with a video/* Content-Type, treat the URL as a direct
        download and hand the open handle to the downloader via 'urlhandle'."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin parameters with whichever separator the URL needs.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url.encode('utf-8'))
        self.report_extraction(mobj.group(1))
        # info stays None unless the direct-download branch fills it in;
        # that is the signal used below to take the JSON path instead.
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE: .decode here is Python 2 semantics (str -> unicode).
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand the already-open handle to the downloader so the
                    # response body is not requested twice.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                # urlh was opened in the try block above and is reused here.
                json_code = urlh.read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' object or bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    # Raised deliberately so the except clause below reports it.
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # Global side effect: all subsequent HTTP requests in this process
        # are sent with this User-Agent.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2321
2322
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv media URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fix: was self._download.trouble - no such attribute, so every
            # invalid URL raised AttributeError instead of reporting an error.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The media base URL is taken from the page's image_src thumbnail link.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
2380
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Either a ':shortname' alias (e.g. ':tds') or a full-episodes URL.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    # Known bitrates, best first is NOT the order here - the list is ascending
    # is not guaranteed; turls[-1] below is what actually picks the format.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate -> container extension
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate -> frame size, used only by _print_formats
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report the download of a per-media configuration document."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report the download of the episode's MRSS show index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the SWF player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)


    def _print_formats(self, formats):
        """Print the given bitrate ids with their extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve shortname aliases, follow the redirect to a concrete
        episode, then walk the MRSS index and build one info dict per item."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Rewrite ':tds'-style aliases into the show's full-episodes URL and
        # re-match so the named groups below are populated.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No episode part means "download the newest episode": the site
        # redirects the bare full-episodes URL to a concrete one.
        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Pick the episode title out of the post-redirect URL.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the media id without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Follow redirects to learn the final player URL (currently only
            # used for the commented-out 'player_url' field below).
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        # One MRSS <item> per act/segment of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs for this segment.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
2568
2569
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist."""

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report the start of the player-configuration download."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the charset advertised in Content-Type; fall back to UTF-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Each of the <meta> tags below may be absent if the site layout
        # changes; fail with a clear message instead of crashing with an
        # AttributeError on None.group(1).
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract description')
            return
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract thumbnail')
            return
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract configuration URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2641
2642
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Captures the numeric video id plus the trailing URL slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the video info dict; returns a one-element list.

        Two-step flow: the HTML page only yields an internal video id,
        which is then used to fetch a "moogaloop" metadata XML document
        carrying the real media URL, title, description and thumbnail.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        self.report_webpage(video_id)
        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # The page embeds the internal id as id="video:NNN".
        m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
            return
        internal_video_id = m.group('internalvideoid')

        info = {
            'id': video_id,
            'internal_id': internal_video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['url'] = videoNode.findall('./file')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            # Derive the extension from the media URL ("...foo.mp4" -> "mp4").
            info['ext'] = info['url'].rpartition('.')[2]
        except IndexError:
            # findall()[0] raises IndexError when a required tag is missing.
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        return [info]
2706
2707
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract url/title/thumbnail; returns a one-element info-dict list."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # The URL is already text, so the match group is text; the previous
        # .decode('utf-8') broke on Python 3 where str has no decode().
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            # Decode the page once so all regexes below operate on text on
            # both Python 2 and Python 3.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the flv_url player parameter)
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title from the <title> tag, dropping the site suffix
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract video thumbnail (the whole matched URL, hence group(0))
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(0)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
2776
2777
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the stream URL and metadata for one track."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + u'-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        request = compat_urllib_request.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
        if mobj is None:
            # Previously a failed match fell through and the code crashed
            # later with a NameError on video_id; fail cleanly instead.
            self._downloader.trouble(u'ERROR: unable to extract uid and stream token')
            return
        video_id = mobj.group(1)
        stream_token = mobj.group(2)

        # extract unsimplified title; fall back to the URL-derived one
        mobj = re.search('"title":"(.*?)",', webpage)
        if mobj:
            title = mobj.group(1).decode('utf-8')
        else:
            title = simple_title

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        # description
        description = u'No description available'
        mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
        if mobj:
            description = mobj.group(1)

        # upload date, converted to the YYYYMMDD form the downloader expects
        upload_date = None
        mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
        if mobj:
            try:
                upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
            except Exception as err:
                # A malformed date is not fatal; log it and carry on.
                self._downloader.to_stderr(compat_str(err))

        # for soundcloud, a request to a cross domain is required for cookies
        request = compat_urllib_request.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

        return [{
            'id': video_id.decode('utf-8'),
            'url': mediaURL,
            'uploader': uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': title,
            'ext': u'mp3',
            'description': description.decode('utf-8')
        }]
2868
2869
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMP URL and metadata; returns a one-element list."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL: the page stores it base64-encoded in jsclassref.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract description (optional; keep a placeholder when absent)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        video_filename = video_url.split('/')[-1]
        # Split only on the LAST dot: the old split('.') raised ValueError
        # for basenames containing more than one dot (e.g. 'talk.v2.mp4').
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2938
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData[fmt] is either a {bitrate: [url, ...]} mapping or, when no
        bitrate information exists, a plain url list (the TypeError path).
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if all are dead."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue  # dead link; try the next candidate

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate combination to stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Pick a working file URL per the requested format and return its info."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        if file_url is None:
            # Every candidate URL was dead; previously this crashed below
            # with AttributeError on file_url.decode(...).
            self._downloader.trouble(u'ERROR: unable to find a working file URL')
            return

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
3051
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches three URL shapes on openclassroom.stanford.edu: the site root,
    # a course page (?course=...), and a single video (?course=...&video=...).
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL type: single video, course playlist, or site root.

        Course and root pages recurse through self.extract() on every page
        they link to and concatenate the per-video results.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # findall()[0] raises IndexError when a required tag is absent.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            # Title from the page heading; fall back to the course id.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # orderedSet keeps page order while dropping duplicate links.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse: each referenced VideoPage goes through the
                # single-video branch above.
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect every course page linked from the home page.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse: each CoursePage goes through the course branch above.
                results += self.extract(entry['url'])
            return results
3168
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the media URL from MTV's mediaGen XML service."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            # Scheme-less URLs are accepted by the regex; normalize them.
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Message previously read 'unable to mtvn_uri' (missing verb).
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # Prefixed with ERROR: for consistency with every other message.
            self._downloader.trouble(u'ERROR: Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3258
3259
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com"""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Generate a session id (millisecond timestamp + two random numbers),
        mimicking what the site's player does."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the character-shuffle table from the server-supplied seed
        using the player's linear-congruential scramble."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the obfuscated '*'-separated file id via the mix table."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Return one info dict per video segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            seg_number = len(config['data'][0]['segs'][format])

            keys = []
            # range() instead of the Python-2-only xrange(); same behaviour.
            for i in range(seg_number):
                keys.append(config['data'][0]['segs'][format][i]['k'])

            # TODO check error
            # youku only could be viewed from mainland china
        except Exception:
            # Was a bare 'except:', which also swallowed KeyboardInterrupt
            # and SystemExit; narrowed while keeping the same error report.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # column 8,9 of fileid represent the segment number
        # fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3380
3381
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract url/title/thumbnail; returns a one-element info-dict list."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # The URL is already text, so the match group is text; the previous
        # .decode('utf-8') broke on Python 3 where str has no decode().
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content, decoded once so the class-level regexes work
        # on text under both Python 2 and Python 3.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3443
3444
3445 class GooglePlusIE(InfoExtractor):
3446 """Information extractor for plus.google.com."""
3447
3448 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3449 IE_NAME = u'plus.google'
3450
    def __init__(self, downloader=None):
        # Plain delegation to the base class; no extra state is kept here.
        InfoExtractor.__init__(self, downloader)
3453
3454 def report_extract_entry(self, url):
3455 """Report downloading extry"""
3456 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3457
3458 def report_date(self, upload_date):
3459 """Report downloading extry"""
3460 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3461
3462 def report_uploader(self, uploader):
3463 """Report downloading extry"""
3464 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3465
3466 def report_title(self, video_title):
3467 """Report downloading extry"""
3468 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3469
3470 def report_extract_vid_page(self, video_page):
3471 """Report information extraction."""
3472 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3473
3474 def _real_extract(self, url):
3475 # Extract id from URL
3476 mobj = re.match(self._VALID_URL, url)
3477 if mobj is None:
3478 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3479 return
3480
3481 post_url = mobj.group(0)
3482 video_id = mobj.group(2)
3483
3484 video_extension = 'flv'
3485
3486 # Step 1, Retrieve post webpage to extract further information
3487 self.report_extract_entry(post_url)
3488 request = compat_urllib_request.Request(post_url)
3489 try:
3490 webpage = compat_urllib_request.urlopen(request).read()
3491 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3492 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3493 return
3494
3495 # Extract update date
3496 upload_date = None
3497 pattern = 'title="Timestamp">(.*?)</a>'
3498 mobj = re.search(pattern, webpage)
3499 if mobj:
3500 upload_date = mobj.group(1)
3501 # Convert timestring to a format suitable for filename
3502 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3503 upload_date = upload_date.strftime('%Y%m%d')
3504 self.report_date(upload_date)
3505
3506 # Extract uploader
3507 uploader = None
3508 pattern = r'rel\="author".*?>(.*?)</a>'
3509 mobj = re.search(pattern, webpage)
3510 if mobj:
3511 uploader = mobj.group(1)
3512 self.report_uploader(uploader)
3513
3514 # Extract title
3515 # Get the first line for title
3516 video_title = u'NA'
3517 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3518 mobj = re.search(pattern, webpage)
3519 if mobj:
3520 video_title = mobj.group(1)
3521 self.report_title(video_title)
3522
3523 # Step 2, Stimulate clicking the image box to launch video
3524 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3525 mobj = re.search(pattern, webpage)
3526 if mobj is None:
3527 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3528
3529 video_page = mobj.group(1)
3530 request = compat_urllib_request.Request(video_page)
3531 try:
3532 webpage = compat_urllib_request.urlopen(request).read()
3533 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3534 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3535 return
3536 self.report_extract_vid_page(video_page)
3537
3538
3539 # Extract video links on video page
3540 """Extract video links of all sizes"""
3541 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3542 mobj = re.findall(pattern, webpage)
3543 if len(mobj) == 0:
3544 self._downloader.trouble(u'ERROR: unable to extract video links')
3545
3546 # Sort in resolution
3547 links = sorted(mobj)
3548
3549 # Choose the lowest of the sort, i.e. highest resolution
3550 video_url = links[-1]
3551 # Only get the url. The resolution part in the tuple has no use anymore
3552 video_url = video_url[-1]
3553 # Treat escaped \u0026 style hex
3554 video_url = unicode(video_url, "unicode_escape")
3555
3556
3557 return [{
3558 'id': video_id.decode('utf-8'),
3559 'url': video_url,
3560 'uploader': uploader.decode('utf-8'),
3561 'upload_date': upload_date.decode('utf-8'),
3562 'title': video_title.decode('utf-8'),
3563 'ext': video_extension.decode('utf-8'),
3564 }]