]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
44b2472c2677334b935f9e0723d53d43cd915a5e
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 from urlparse import parse_qs
19
20 try:
21 import cStringIO as StringIO
22 except ImportError:
23 import StringIO
24
25 from utils import *
26
27
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor (IE) receives a URL and produces a *list* of
    dictionaries describing the video(s) behind it.  Each dictionary is
    handed to the FileDownloader, which may then download the video.

    Required fields in each dictionary:

        id:         Video identifier.
        url:        Final video URL.
        uploader:   Nickname of the video uploader.
        title:      Video title, unescaped.
        ext:        Video filename extension.
        player_url: SWF Player URL (may be None).

    Optional fields:

        format:     The video format, defaults to ext. Used by --get-format
        thumbnail:  Full URL to a video thumbnail image.
        description One-line video description.

    Subclasses must define a _VALID_URL regexp and override
    _real_initialize() and _real_extract(); they should normally also be
    registered in the list of extractors.
    """

    # Class-level defaults; instances get their own values in __init__.
    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this extractor can handle the given URL."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Run one-time setup (authentication, etc.) at most once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return a list of info dictionaries."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the FileDownloader this extractor reports to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Actual setup work; redefined by subclasses."""
        pass

    def _real_extract(self, url):
        """Actual extraction work; redefined by subclasses."""
        pass
96
97
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Matched with re.VERBOSE (see suitable()). Group 1 captures everything
    # before the video ID; group 2 is the video ID itself.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" display string used by _print_formats/format field
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is a commented regex.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # captions without an explicit duration stay up 4 seconds
            start = float(start)
            end = start + float(dur)
            # SRT timestamps are HH:MM:SS,mmm (comma before the milliseconds)
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print the itag, extension and dimensions of each available format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the interface language, log in (when credentials are known)
        and pre-confirm age so restricted videos can be fetched later.

        All failures are reported through the downloader; nothing is raised.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present, authentication failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video info dictionaries for a YouTube URL.

        Returns a list with one dict per selected format, or None after
        reporting trouble through the downloader.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL (group 2 of _VALID_URL; group 1 is the prefix)
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Attempt to extract SWF player URL (needed for rtmpdump signature checks)
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JS string (\/ -> /, etc.)
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one returns a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            # NOTE(review): no break on success — after a format matches,
            # the remaining strptime calls raise on the converted value and
            # are swallowed by the bare except, leaving the result intact.
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            # Any failure in this block raises Trouble, which is reported
            # as a warning below without aborting the extraction.
            try:
                self.report_video_subtitles_download(video_id)
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                # Map lang_code -> track name
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                # Language preference: user choice > English > first available
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    srt_lang = srt_lang_list.keys()[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # NOTE(review): relies on Python 2 exception indexing
                # (trouble[0] == first constructor argument); verify Trouble
                # supports this in utils.
                self._downloader.trouble(trouble[0])

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = urllib.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            # NOTE(review): the filter only guarantees 'itag' and 'url';
            # a stream entry without 'sig' would raise KeyError here.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # format_param is None for RTMP downloads; fall back to the extension
            video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
                                            self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id.decode('utf-8'),
                'url':      video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension.decode('utf-8'),
                'format':   video_format,
                'thumbnail':    video_thumbnail.decode('utf-8'),
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
497
498
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and accept the family filter so that
        filtered videos are reachable; failures are reported, not raised."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the info dictionary for a Metacafe video URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate the whole
        # download to the downloader (which will pick YoutubeIE).
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]  # NOTE(review): assumes a 3-letter extension

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: parse the flashvars blob for mediaData
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')  # un-escape JSON slashes
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'player_url':   None,
        }]
625
626
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the info dictionary for a Dailymotion video URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Strip the slug/query: the ID is the part before '_' and '?'
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-restricted pages render fully
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = urllib.unquote(mobj.group(1))

        # Pick the best available quality key (list is ordered best-first);
        # the for/else fires only when no key is present at all.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        video_uploader = u'NA'
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = u'NA'
        # Page shows DD-MM-YYYY; convert to YYYYMMDD
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'player_url':   None,
        }]
723
724
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the info dictionary for a Google Video URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # No mp4 download URL; fall back to the flv stream URL,
            # which is hex-escaped in the page source.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
        mediaURL = urllib.unquote(mobj.group(1))
        # Decode the \xNN escapes left in the flv variant ('=' and '&')
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail (costs an extra search-page request,
        # so only done on demand)
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': u'NA',
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'player_url':   None,
        }]
817
818
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the info dictionary for a Photobucket .flv URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader come from the same <title> pattern
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'player_url':   None,
        }]
883
884
885 class YahooIE(InfoExtractor):
886 """Information extractor for video.yahoo.com."""
887
888 # _VALID_URL matches all Yahoo! Video URLs
889 # _VPAGE_URL matches only the extractable '/watch/' URLs
890 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
891 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
892 IE_NAME = u'video.yahoo'
893
894 def __init__(self, downloader=None):
895 InfoExtractor.__init__(self, downloader)
896
897 def report_download_webpage(self, video_id):
898 """Report webpage download."""
899 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
900
901 def report_extraction(self, video_id):
902 """Report information extraction."""
903 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
904
905 def _real_extract(self, url, new_video=True):
906 # Extract ID from URL
907 mobj = re.match(self._VALID_URL, url)
908 if mobj is None:
909 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
910 return
911
912 video_id = mobj.group(2)
913 video_extension = 'flv'
914
915 # Rewrite valid but non-extractable URLs as
916 # extractable English language /watch/ URLs
917 if re.match(self._VPAGE_URL, url) is None:
918 request = urllib2.Request(url)
919 try:
920 webpage = urllib2.urlopen(request).read()
921 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
922 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
923 return
924
925 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
926 if mobj is None:
927 self._downloader.trouble(u'ERROR: Unable to extract id field')
928 return
929 yahoo_id = mobj.group(1)
930
931 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
932 if mobj is None:
933 self._downloader.trouble(u'ERROR: Unable to extract vid field')
934 return
935 yahoo_vid = mobj.group(1)
936
937 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
938 return self._real_extract(url, new_video=False)
939
940 # Retrieve video webpage to extract further information
941 request = urllib2.Request(url)
942 try:
943 self.report_download_webpage(video_id)
944 webpage = urllib2.urlopen(request).read()
945 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
946 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
947 return
948
949 # Extract uploader and title from webpage
950 self.report_extraction(video_id)
951 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
952 if mobj is None:
953 self._downloader.trouble(u'ERROR: unable to extract video title')
954 return
955 video_title = mobj.group(1).decode('utf-8')
956
957 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
958 if mobj is None:
959 self._downloader.trouble(u'ERROR: unable to extract video uploader')
960 return
961 video_uploader = mobj.group(1).decode('utf-8')
962
963 # Extract video thumbnail
964 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
965 if mobj is None:
966 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
967 return
968 video_thumbnail = mobj.group(1).decode('utf-8')
969
970 # Extract video description
971 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
972 if mobj is None:
973 self._downloader.trouble(u'ERROR: unable to extract video description')
974 return
975 video_description = mobj.group(1).decode('utf-8')
976 if not video_description:
977 video_description = 'No description available.'
978
979 # Extract video height and width
980 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
981 if mobj is None:
982 self._downloader.trouble(u'ERROR: unable to extract video height')
983 return
984 yv_video_height = mobj.group(1)
985
986 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
987 if mobj is None:
988 self._downloader.trouble(u'ERROR: unable to extract video width')
989 return
990 yv_video_width = mobj.group(1)
991
992 # Retrieve video playlist to extract media URL
993 # I'm not completely sure what all these options are, but we
994 # seem to need most of them, otherwise the server sends a 401.
995 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
996 yv_bitrate = '700' # according to Wikipedia this is hard-coded
997 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
998 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
999 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1000 try:
1001 self.report_download_webpage(video_id)
1002 webpage = urllib2.urlopen(request).read()
1003 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1004 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1005 return
1006
1007 # Extract media URL from playlist XML
1008 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1009 if mobj is None:
1010 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1011 return
1012 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1013 video_url = unescapeHTML(video_url)
1014
1015 return [{
1016 'id': video_id.decode('utf-8'),
1017 'url': video_url,
1018 'uploader': video_uploader,
1019 'upload_date': u'NA',
1020 'title': video_title,
1021 'ext': video_extension.decode('utf-8'),
1022 'thumbnail': video_thumbnail.decode('utf-8'),
1023 'description': video_description,
1024 'thumbnail': video_thumbnail,
1025 'player_url': None,
1026 }]
1027
1028
1029 class VimeoIE(InfoExtractor):
1030 """Information extractor for vimeo.com."""
1031
1032 # _VALID_URL matches Vimeo URLs
1033 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
1034 IE_NAME = u'vimeo'
1035
1036 def __init__(self, downloader=None):
1037 InfoExtractor.__init__(self, downloader)
1038
1039 def report_download_webpage(self, video_id):
1040 """Report webpage download."""
1041 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1042
1043 def report_extraction(self, video_id):
1044 """Report information extraction."""
1045 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1046
1047 def _real_extract(self, url, new_video=True):
1048 # Extract ID from URL
1049 mobj = re.match(self._VALID_URL, url)
1050 if mobj is None:
1051 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1052 return
1053
1054 video_id = mobj.group(1)
1055
1056 # Retrieve video webpage to extract further information
1057 request = urllib2.Request(url, None, std_headers)
1058 try:
1059 self.report_download_webpage(video_id)
1060 webpage = urllib2.urlopen(request).read()
1061 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1062 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1063 return
1064
1065 # Now we begin extracting as much information as we can from what we
1066 # retrieved. First we extract the information common to all extractors,
1067 # and latter we extract those that are Vimeo specific.
1068 self.report_extraction(video_id)
1069
1070 # Extract the config JSON
1071 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1072 try:
1073 config = json.loads(config)
1074 except:
1075 self._downloader.trouble(u'ERROR: unable to extract info section')
1076 return
1077
1078 # Extract title
1079 video_title = config["video"]["title"]
1080
1081 # Extract uploader
1082 video_uploader = config["video"]["owner"]["name"]
1083
1084 # Extract video thumbnail
1085 video_thumbnail = config["video"]["thumbnail"]
1086
1087 # Extract video description
1088 video_description = get_element_by_id("description", webpage.decode('utf8'))
1089 if video_description: video_description = clean_html(video_description)
1090 else: video_description = ''
1091
1092 # Extract upload date
1093 video_upload_date = u'NA'
1094 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1095 if mobj is not None:
1096 video_upload_date = mobj.group(1)
1097
1098 # Vimeo specific: extract request signature and timestamp
1099 sig = config['request']['signature']
1100 timestamp = config['request']['timestamp']
1101
1102 # Vimeo specific: extract video codec and quality information
1103 # First consider quality, then codecs, then take everything
1104 # TODO bind to format param
1105 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1106 files = { 'hd': [], 'sd': [], 'other': []}
1107 for codec_name, codec_extension in codecs:
1108 if codec_name in config["video"]["files"]:
1109 if 'hd' in config["video"]["files"][codec_name]:
1110 files['hd'].append((codec_name, codec_extension, 'hd'))
1111 elif 'sd' in config["video"]["files"][codec_name]:
1112 files['sd'].append((codec_name, codec_extension, 'sd'))
1113 else:
1114 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1115
1116 for quality in ('hd', 'sd', 'other'):
1117 if len(files[quality]) > 0:
1118 video_quality = files[quality][0][2]
1119 video_codec = files[quality][0][0]
1120 video_extension = files[quality][0][1]
1121 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1122 break
1123 else:
1124 self._downloader.trouble(u'ERROR: no known codec found')
1125 return
1126
1127 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1128 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1129
1130 return [{
1131 'id': video_id,
1132 'url': video_url,
1133 'uploader': video_uploader,
1134 'upload_date': video_upload_date,
1135 'title': video_title,
1136 'ext': video_extension,
1137 'thumbnail': video_thumbnail,
1138 'description': video_description,
1139 'player_url': None,
1140 }]
1141
1142
1143 class GenericIE(InfoExtractor):
1144 """Generic last-resort information extractor."""
1145
1146 _VALID_URL = r'.*'
1147 IE_NAME = u'generic'
1148
1149 def __init__(self, downloader=None):
1150 InfoExtractor.__init__(self, downloader)
1151
1152 def report_download_webpage(self, video_id):
1153 """Report webpage download."""
1154 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1155 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1156
1157 def report_extraction(self, video_id):
1158 """Report information extraction."""
1159 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1160
1161 def report_following_redirect(self, new_url):
1162 """Report information extraction."""
1163 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1164
1165 def _test_redirect(self, url):
1166 """Check if it is a redirect, like url shorteners, in case restart chain."""
1167 class HeadRequest(urllib2.Request):
1168 def get_method(self):
1169 return "HEAD"
1170
1171 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1172 """
1173 Subclass the HTTPRedirectHandler to make it use our
1174 HeadRequest also on the redirected URL
1175 """
1176 def redirect_request(self, req, fp, code, msg, headers, newurl):
1177 if code in (301, 302, 303, 307):
1178 newurl = newurl.replace(' ', '%20')
1179 newheaders = dict((k,v) for k,v in req.headers.items()
1180 if k.lower() not in ("content-length", "content-type"))
1181 return HeadRequest(newurl,
1182 headers=newheaders,
1183 origin_req_host=req.get_origin_req_host(),
1184 unverifiable=True)
1185 else:
1186 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1187
1188 class HTTPMethodFallback(urllib2.BaseHandler):
1189 """
1190 Fallback to GET if HEAD is not allowed (405 HTTP error)
1191 """
1192 def http_error_405(self, req, fp, code, msg, headers):
1193 fp.read()
1194 fp.close()
1195
1196 newheaders = dict((k,v) for k,v in req.headers.items()
1197 if k.lower() not in ("content-length", "content-type"))
1198 return self.parent.open(urllib2.Request(req.get_full_url(),
1199 headers=newheaders,
1200 origin_req_host=req.get_origin_req_host(),
1201 unverifiable=True))
1202
1203 # Build our opener
1204 opener = urllib2.OpenerDirector()
1205 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1206 HTTPMethodFallback, HEADRedirectHandler,
1207 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1208 opener.add_handler(handler())
1209
1210 response = opener.open(HeadRequest(url))
1211 new_url = response.geturl()
1212
1213 if url == new_url: return False
1214
1215 self.report_following_redirect(new_url)
1216 self._downloader.download([new_url])
1217 return True
1218
1219 def _real_extract(self, url):
1220 if self._test_redirect(url): return
1221
1222 video_id = url.split('/')[-1]
1223 request = urllib2.Request(url)
1224 try:
1225 self.report_download_webpage(video_id)
1226 webpage = urllib2.urlopen(request).read()
1227 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1228 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1229 return
1230 except ValueError, err:
1231 # since this is the last-resort InfoExtractor, if
1232 # this error is thrown, it'll be thrown here
1233 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1234 return
1235
1236 self.report_extraction(video_id)
1237 # Start with something easy: JW Player in SWFObject
1238 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1239 if mobj is None:
1240 # Broaden the search a little bit
1241 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1242 if mobj is None:
1243 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1244 return
1245
1246 # It's possible that one of the regexes
1247 # matched, but returned an empty group:
1248 if mobj.group(1) is None:
1249 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1250 return
1251
1252 video_url = urllib.unquote(mobj.group(1))
1253 video_id = os.path.basename(video_url)
1254
1255 # here's a fun little line of code for you:
1256 video_extension = os.path.splitext(video_id)[1][1:]
1257 video_id = os.path.splitext(video_id)[0]
1258
1259 # it's tempting to parse this further, but you would
1260 # have to take into account all the variations like
1261 # Video Title - Site Name
1262 # Site Name | Video Title
1263 # Video Title - Tagline | Site Name
1264 # and so on and so forth; it's just not practical
1265 mobj = re.search(r'<title>(.*)</title>', webpage)
1266 if mobj is None:
1267 self._downloader.trouble(u'ERROR: unable to extract title')
1268 return
1269 video_title = mobj.group(1).decode('utf-8')
1270
1271 # video uploader is domain name
1272 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1273 if mobj is None:
1274 self._downloader.trouble(u'ERROR: unable to extract title')
1275 return
1276 video_uploader = mobj.group(1).decode('utf-8')
1277
1278 return [{
1279 'id': video_id.decode('utf-8'),
1280 'url': video_url.decode('utf-8'),
1281 'uploader': video_uploader,
1282 'upload_date': u'NA',
1283 'title': video_title,
1284 'ext': video_extension.decode('utf-8'),
1285 'player_url': None,
1286 }]
1287
1288
1289 class YoutubeSearchIE(InfoExtractor):
1290 """Information Extractor for YouTube search queries."""
1291 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1292 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1293 _max_youtube_results = 1000
1294 IE_NAME = u'youtube:search'
1295
1296 def __init__(self, downloader=None):
1297 InfoExtractor.__init__(self, downloader)
1298
1299 def report_download_page(self, query, pagenum):
1300 """Report attempt to download search page with given number."""
1301 query = query.decode(preferredencoding())
1302 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1303
1304 def _real_extract(self, query):
1305 mobj = re.match(self._VALID_URL, query)
1306 if mobj is None:
1307 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1308 return
1309
1310 prefix, query = query.split(':')
1311 prefix = prefix[8:]
1312 query = query.encode('utf-8')
1313 if prefix == '':
1314 self._download_n_results(query, 1)
1315 return
1316 elif prefix == 'all':
1317 self._download_n_results(query, self._max_youtube_results)
1318 return
1319 else:
1320 try:
1321 n = long(prefix)
1322 if n <= 0:
1323 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1324 return
1325 elif n > self._max_youtube_results:
1326 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1327 n = self._max_youtube_results
1328 self._download_n_results(query, n)
1329 return
1330 except ValueError: # parsing prefix as integer fails
1331 self._download_n_results(query, 1)
1332 return
1333
1334 def _download_n_results(self, query, n):
1335 """Downloads a specified number of results for a query"""
1336
1337 video_ids = []
1338 pagenum = 0
1339 limit = n
1340
1341 while (50 * pagenum) < limit:
1342 self.report_download_page(query, pagenum+1)
1343 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1344 request = urllib2.Request(result_url)
1345 try:
1346 data = urllib2.urlopen(request).read()
1347 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1348 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1349 return
1350 api_response = json.loads(data)['data']
1351
1352 new_ids = list(video['id'] for video in api_response['items'])
1353 video_ids += new_ids
1354
1355 limit = min(n, api_response['totalItems'])
1356 pagenum += 1
1357
1358 if len(video_ids) > n:
1359 video_ids = video_ids[:n]
1360 for id in video_ids:
1361 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1362 return
1363
1364
1365 class GoogleSearchIE(InfoExtractor):
1366 """Information Extractor for Google Video search queries."""
1367 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1368 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1369 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1370 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1371 _max_google_results = 1000
1372 IE_NAME = u'video.google:search'
1373
1374 def __init__(self, downloader=None):
1375 InfoExtractor.__init__(self, downloader)
1376
1377 def report_download_page(self, query, pagenum):
1378 """Report attempt to download playlist page with given number."""
1379 query = query.decode(preferredencoding())
1380 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1381
1382 def _real_extract(self, query):
1383 mobj = re.match(self._VALID_URL, query)
1384 if mobj is None:
1385 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1386 return
1387
1388 prefix, query = query.split(':')
1389 prefix = prefix[8:]
1390 query = query.encode('utf-8')
1391 if prefix == '':
1392 self._download_n_results(query, 1)
1393 return
1394 elif prefix == 'all':
1395 self._download_n_results(query, self._max_google_results)
1396 return
1397 else:
1398 try:
1399 n = long(prefix)
1400 if n <= 0:
1401 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1402 return
1403 elif n > self._max_google_results:
1404 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1405 n = self._max_google_results
1406 self._download_n_results(query, n)
1407 return
1408 except ValueError: # parsing prefix as integer fails
1409 self._download_n_results(query, 1)
1410 return
1411
1412 def _download_n_results(self, query, n):
1413 """Downloads a specified number of results for a query"""
1414
1415 video_ids = []
1416 pagenum = 0
1417
1418 while True:
1419 self.report_download_page(query, pagenum)
1420 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1421 request = urllib2.Request(result_url)
1422 try:
1423 page = urllib2.urlopen(request).read()
1424 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1425 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1426 return
1427
1428 # Extract video identifiers
1429 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1430 video_id = mobj.group(1)
1431 if video_id not in video_ids:
1432 video_ids.append(video_id)
1433 if len(video_ids) == n:
1434 # Specified n videos reached
1435 for id in video_ids:
1436 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1437 return
1438
1439 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1440 for id in video_ids:
1441 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1442 return
1443
1444 pagenum = pagenum + 1
1445
1446
1447 class YahooSearchIE(InfoExtractor):
1448 """Information Extractor for Yahoo! Video search queries."""
1449 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1450 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1451 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1452 _MORE_PAGES_INDICATOR = r'\s*Next'
1453 _max_yahoo_results = 1000
1454 IE_NAME = u'video.yahoo:search'
1455
1456 def __init__(self, downloader=None):
1457 InfoExtractor.__init__(self, downloader)
1458
1459 def report_download_page(self, query, pagenum):
1460 """Report attempt to download playlist page with given number."""
1461 query = query.decode(preferredencoding())
1462 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1463
1464 def _real_extract(self, query):
1465 mobj = re.match(self._VALID_URL, query)
1466 if mobj is None:
1467 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1468 return
1469
1470 prefix, query = query.split(':')
1471 prefix = prefix[8:]
1472 query = query.encode('utf-8')
1473 if prefix == '':
1474 self._download_n_results(query, 1)
1475 return
1476 elif prefix == 'all':
1477 self._download_n_results(query, self._max_yahoo_results)
1478 return
1479 else:
1480 try:
1481 n = long(prefix)
1482 if n <= 0:
1483 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1484 return
1485 elif n > self._max_yahoo_results:
1486 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1487 n = self._max_yahoo_results
1488 self._download_n_results(query, n)
1489 return
1490 except ValueError: # parsing prefix as integer fails
1491 self._download_n_results(query, 1)
1492 return
1493
1494 def _download_n_results(self, query, n):
1495 """Downloads a specified number of results for a query"""
1496
1497 video_ids = []
1498 already_seen = set()
1499 pagenum = 1
1500
1501 while True:
1502 self.report_download_page(query, pagenum)
1503 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1504 request = urllib2.Request(result_url)
1505 try:
1506 page = urllib2.urlopen(request).read()
1507 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1508 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1509 return
1510
1511 # Extract video identifiers
1512 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1513 video_id = mobj.group(1)
1514 if video_id not in already_seen:
1515 video_ids.append(video_id)
1516 already_seen.add(video_id)
1517 if len(video_ids) == n:
1518 # Specified n videos reached
1519 for id in video_ids:
1520 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1521 return
1522
1523 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1524 for id in video_ids:
1525 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1526 return
1527
1528 pagenum = pagenum + 1
1529
1530
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # group(1): list-type query parameter ('p', 'a' or 'list');
    # group(2): playlist id; group(3), when present: a single video id.
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    # Filled with (access_page, prefix, playlist_id, page_number).
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    # %s is interpolated with the playlist id so only links belonging to
    # this playlist are matched.
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Queue every video of the playlist with the downloader.

        Does not return info dicts itself; each video id is handed back to
        self._downloader.download() as a plain watch URL.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            # 'list' and bare playlist ids are fetched through the classic
            # view_play_list endpoint with the 'p' parameter.
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = urllib2.Request(url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, preserving page order and dropping
            # duplicates within the page.
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # Stop when the pager has no "next" control.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            pagenum = pagenum + 1

        # Honour --playlist-start / --playlist-end (playliststart is 1-based,
        # converted here to a 0-based slice index; -1 means "to the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1602
1603
1604 class YoutubeChannelIE(InfoExtractor):
1605 """Information Extractor for YouTube channels."""
1606
1607 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1608 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1609 _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1610 IE_NAME = u'youtube:channel'
1611
1612 def report_download_page(self, channel_id, pagenum):
1613 """Report attempt to download channel page with given number."""
1614 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1615
1616 def _real_extract(self, url):
1617 # Extract channel id
1618 mobj = re.match(self._VALID_URL, url)
1619 if mobj is None:
1620 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1621 return
1622
1623 # Download channel pages
1624 channel_id = mobj.group(1)
1625 video_ids = []
1626 pagenum = 1
1627
1628 while True:
1629 self.report_download_page(channel_id, pagenum)
1630 url = self._TEMPLATE_URL % (channel_id, pagenum)
1631 request = urllib2.Request(url)
1632 try:
1633 page = urllib2.urlopen(request).read()
1634 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1635 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1636 return
1637
1638 # Extract video identifiers
1639 ids_in_page = []
1640 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1641 if mobj.group(1) not in ids_in_page:
1642 ids_in_page.append(mobj.group(1))
1643 video_ids.extend(ids_in_page)
1644
1645 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1646 break
1647 pagenum = pagenum + 1
1648
1649 for id in video_ids:
1650 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1651 return
1652
1653
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    # Accepts both user-page URLs and the 'ytuser:<name>' shorthand.
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps results per request, so uploads are fetched page by page.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Queue all of a user's uploads with the downloader.

        Honours the 'playliststart'/'playlistend' downloader params; each
        selected video id is handed back as a plain watch URL.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            # GData start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, keeping feed order and dropping
            # duplicates within the page.
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # playliststart is 1-based; converted here to a 0-based slice index.
        # playlistend == -1 means "to the end".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1735
1736
1737 class BlipTVUserIE(InfoExtractor):
1738 """Information Extractor for blip.tv users."""
1739
1740 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1741 _PAGE_SIZE = 12
1742 IE_NAME = u'blip.tv:user'
1743
1744 def __init__(self, downloader=None):
1745 InfoExtractor.__init__(self, downloader)
1746
1747 def report_download_page(self, username, pagenum):
1748 """Report attempt to download user page."""
1749 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1750 (self.IE_NAME, username, pagenum))
1751
1752 def _real_extract(self, url):
1753 # Extract username
1754 mobj = re.match(self._VALID_URL, url)
1755 if mobj is None:
1756 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1757 return
1758
1759 username = mobj.group(1)
1760
1761 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1762
1763 request = urllib2.Request(url)
1764
1765 try:
1766 page = urllib2.urlopen(request).read().decode('utf-8')
1767 mobj = re.search(r'data-users-id="([^"]+)"', page)
1768 page_base = page_base % mobj.group(1)
1769 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1770 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1771 return
1772
1773
1774 # Download video ids using BlipTV Ajax calls. Result size per
1775 # query is limited (currently to 12 videos) so we need to query
1776 # page by page until there are no video ids - it means we got
1777 # all of them.
1778
1779 video_ids = []
1780 pagenum = 1
1781
1782 while True:
1783 self.report_download_page(username, pagenum)
1784
1785 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1786
1787 try:
1788 page = urllib2.urlopen(request).read().decode('utf-8')
1789 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1790 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1791 return
1792
1793 # Extract video identifiers
1794 ids_in_page = []
1795
1796 for mobj in re.finditer(r'href="/([^"]+)"', page):
1797 if mobj.group(1) not in ids_in_page:
1798 ids_in_page.append(unescapeHTML(mobj.group(1)))
1799
1800 video_ids.extend(ids_in_page)
1801
1802 # A little optimization - if current page is not
1803 # "full", ie. does not contain PAGE_SIZE video ids then
1804 # we can assume that this page is the last one - there
1805 # are no more ids on further pages - no need to query
1806 # again.
1807
1808 if len(ids_in_page) < self._PAGE_SIZE:
1809 break
1810
1811 pagenum += 1
1812
1813 all_ids_count = len(video_ids)
1814 playliststart = self._downloader.params.get('playliststart', 1) - 1
1815 playlistend = self._downloader.params.get('playlistend', -1)
1816
1817 if playlistend == -1:
1818 video_ids = video_ids[playliststart:]
1819 else:
1820 video_ids = video_ids[playliststart:playlistend]
1821
1822 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1823 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1824
1825 for video_id in video_ids:
1826 self._downloader.download([u'http://blip.tv/'+video_id])
1827
1828
1829 class DepositFilesIE(InfoExtractor):
1830 """Information extractor for depositfiles.com"""
1831
1832 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1833 IE_NAME = u'DepositFiles'
1834
1835 def __init__(self, downloader=None):
1836 InfoExtractor.__init__(self, downloader)
1837
1838 def report_download_webpage(self, file_id):
1839 """Report webpage download."""
1840 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1841
1842 def report_extraction(self, file_id):
1843 """Report information extraction."""
1844 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1845
1846 def _real_extract(self, url):
1847 file_id = url.split('/')[-1]
1848 # Rebuild url in english locale
1849 url = 'http://depositfiles.com/en/files/' + file_id
1850
1851 # Retrieve file webpage with 'Free download' button pressed
1852 free_download_indication = { 'gateway_result' : '1' }
1853 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1854 try:
1855 self.report_download_webpage(file_id)
1856 webpage = urllib2.urlopen(request).read()
1857 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1858 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1859 return
1860
1861 # Search for the real file URL
1862 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1863 if (mobj is None) or (mobj.group(1) is None):
1864 # Try to figure out reason of the error.
1865 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1866 if (mobj is not None) and (mobj.group(1) is not None):
1867 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1868 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1869 else:
1870 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1871 return
1872
1873 file_url = mobj.group(1)
1874 file_extension = os.path.splitext(file_url)[1][1:]
1875
1876 # Search for file title
1877 mobj = re.search(r'<b title="(.*?)">', webpage)
1878 if mobj is None:
1879 self._downloader.trouble(u'ERROR: unable to extract title')
1880 return
1881 file_title = mobj.group(1).decode('utf-8')
1882
1883 return [{
1884 'id': file_id.decode('utf-8'),
1885 'url': file_url.decode('utf-8'),
1886 'uploader': u'NA',
1887 'upload_date': u'NA',
1888 'title': file_title,
1889 'ext': file_extension.decode('utf-8'),
1890 'player_url': None,
1891 }]
1892
1893
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Matches /video/video.php?v=ID and /photo.php?v=ID; the numeric video
    # id is captured in the named group "ID".
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name used to look up credentials in ~/.netrc.
    _NETRC_MACHINE = 'facebook'
    # Format identifiers, best quality first (used for format selection below).
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are embedded as JS string literals with \uXXXX
                # escapes; decode those, then undo URL quoting.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Optionally log in to Facebook using --username/--password or .netrc.

        Silently proceeds without authentication if no credentials are
        available; login failures are reported as warnings, not errors.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available: continue anonymously.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # The login form being served again indicates the login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download the video page, parse its metadata and return one info
        dictionary per selected format (None on failure paths)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (missing thumbnail is a warning, not fatal)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = u'NA'
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    # NOTE(review): strftime takes a 9-tuple, hence the
                    # [0:9] slice of parsedate_tz's 10-element result.
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        # NOTE(review): when no format URL was found, this falls through and
        # implicitly returns None -- confirm callers tolerate that.
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # Restrict candidates to formats at or below the quality limit.
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

            results = []
            for format_param, video_real_url in video_url_list:
                # Extension
                video_extension = self._video_extensions.get(format_param, 'mp4')

                results.append({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description.decode('utf-8'),
                    'player_url': None,
                })
            return results
2099
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    # Matches any blip.tv path; group 1 is the path portion of the URL.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts a lowercase filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Fetch the JSON description of the video -- or detect a direct
        media link via the Content-Type header -- and build the info dict."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Ask the same URL for its JSON representation.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url.encode('utf-8'))
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = urllib2.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The response is the media itself: derive id/title/ext
                # from the URL's basename and hand the open handle along.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            # Reuse the handle opened above to read the JSON body.
            try:
                json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                # NOTE(review): "json" is not imported in this file; it is
                # presumably provided by "from utils import *" -- confirm.
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp format expected: e.g. "10-31-11 08:35PM".
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # NOTE(review): the global User-Agent is overridden here for the
        # subsequent media download -- presumably to satisfy server-side
        # UA checks; confirm before changing.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2186
2187
2188 class MyVideoIE(InfoExtractor):
2189 """Information Extractor for myvideo.de."""
2190
2191 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2192 IE_NAME = u'myvideo'
2193
2194 def __init__(self, downloader=None):
2195 InfoExtractor.__init__(self, downloader)
2196
2197 def report_download_webpage(self, video_id):
2198 """Report webpage download."""
2199 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2200
2201 def report_extraction(self, video_id):
2202 """Report information extraction."""
2203 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2204
2205 def _real_extract(self,url):
2206 mobj = re.match(self._VALID_URL, url)
2207 if mobj is None:
2208 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2209 return
2210
2211 video_id = mobj.group(1)
2212
2213 # Get video webpage
2214 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2215 try:
2216 self.report_download_webpage(video_id)
2217 webpage = urllib2.urlopen(request).read()
2218 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2219 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
2220 return
2221
2222 self.report_extraction(video_id)
2223 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2224 webpage)
2225 if mobj is None:
2226 self._downloader.trouble(u'ERROR: unable to extract media URL')
2227 return
2228 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2229
2230 mobj = re.search('<title>([^<]+)</title>', webpage)
2231 if mobj is None:
2232 self._downloader.trouble(u'ERROR: unable to extract title')
2233 return
2234
2235 video_title = mobj.group(1)
2236
2237 return [{
2238 'id': video_id,
2239 'url': video_url,
2240 'uploader': u'NA',
2241 'upload_date': u'NA',
2242 'title': video_title,
2243 'ext': u'flv',
2244 'player_url': None,
2245 }]
2246
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts ":tds"/":colbert"-style shorthands as well as full
    # thedailyshow.com / colbertnation.com full-episodes URLs.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    # Format identifiers (bitrates), highest first.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # All renditions are MP4.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Frame sizes per bitrate; used only by _print_formats.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report download of the per-item configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the episode's MRSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)


    def _print_formats(self, formats):
        """Print each available format id with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve the episode page, fetch its MRSS index and return one
        info dict per <item> entry."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Expand the ":shortname" abbreviations to the corresponding
        # full-episodes front page and re-match.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No explicit episode means "download the newest one"; the site
        # redirects the front page to it.
        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
            return
        if dlNewest:
            # Learn the concrete episode URL from the redirect target.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Follow redirects to get the final player URL.
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
            return

        results = []

        # Iterate over the <item> entries of the MRSS index.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Each item has its own configuration XML listing renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
2434
2435
2436 class EscapistIE(InfoExtractor):
2437 """Information extractor for The Escapist """
2438
2439 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2440 IE_NAME = u'escapist'
2441
2442 def report_extraction(self, showName):
2443 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2444
2445 def report_config_download(self, showName):
2446 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2447
2448 def _real_extract(self, url):
2449 mobj = re.match(self._VALID_URL, url)
2450 if mobj is None:
2451 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2452 return
2453 showName = mobj.group('showname')
2454 videoId = mobj.group('episode')
2455
2456 self.report_extraction(showName)
2457 try:
2458 webPage = urllib2.urlopen(url)
2459 webPageBytes = webPage.read()
2460 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2461 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2462 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2463 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2464 return
2465
2466 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2467 description = unescapeHTML(descMatch.group(1))
2468 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2469 imgUrl = unescapeHTML(imgMatch.group(1))
2470 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2471 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2472 configUrlMatch = re.search('config=(.*)$', playerUrl)
2473 configUrl = urllib2.unquote(configUrlMatch.group(1))
2474
2475 self.report_config_download(showName)
2476 try:
2477 configJSON = urllib2.urlopen(configUrl).read()
2478 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2479 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2480 return
2481
2482 # Technically, it's JavaScript, not JSON
2483 configJSON = configJSON.replace("'", '"')
2484
2485 try:
2486 config = json.loads(configJSON)
2487 except (ValueError,), err:
2488 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2489 return
2490
2491 playlist = config['playlist']
2492 videoUrl = playlist[1]['url']
2493
2494 info = {
2495 'id': videoId,
2496 'url': videoUrl,
2497 'uploader': showName,
2498 'upload_date': None,
2499 'title': showName,
2500 'ext': 'flv',
2501 'thumbnail': imgUrl,
2502 'description': description,
2503 'player_url': playerUrl,
2504 }
2505
2506 return [info]
2507
2508
2509 class CollegeHumorIE(InfoExtractor):
2510 """Information extractor for collegehumor.com"""
2511
2512 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2513 IE_NAME = u'collegehumor'
2514
2515 def report_webpage(self, video_id):
2516 """Report information extraction."""
2517 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2518
2519 def report_extraction(self, video_id):
2520 """Report information extraction."""
2521 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2522
2523 def _real_extract(self, url):
2524 mobj = re.match(self._VALID_URL, url)
2525 if mobj is None:
2526 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2527 return
2528 video_id = mobj.group('videoid')
2529
2530 self.report_webpage(video_id)
2531 request = urllib2.Request(url)
2532 try:
2533 webpage = urllib2.urlopen(request).read()
2534 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2535 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2536 return
2537
2538 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2539 if m is None:
2540 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2541 return
2542 internal_video_id = m.group('internalvideoid')
2543
2544 info = {
2545 'id': video_id,
2546 'internal_id': internal_video_id,
2547 }
2548
2549 self.report_extraction(video_id)
2550 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2551 try:
2552 metaXml = urllib2.urlopen(xmlUrl).read()
2553 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2554 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2555 return
2556
2557 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2558 try:
2559 videoNode = mdoc.findall('./video')[0]
2560 info['description'] = videoNode.findall('./description')[0].text
2561 info['title'] = videoNode.findall('./caption')[0].text
2562 info['url'] = videoNode.findall('./file')[0].text
2563 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2564 info['ext'] = info['url'].rpartition('.')[2]
2565 except IndexError:
2566 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2567 return
2568
2569 return [info]
2570
2571
2572 class XVideosIE(InfoExtractor):
2573 """Information extractor for xvideos.com"""
2574
2575 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2576 IE_NAME = u'xvideos'
2577
2578 def report_webpage(self, video_id):
2579 """Report information extraction."""
2580 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2581
2582 def report_extraction(self, video_id):
2583 """Report information extraction."""
2584 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2585
2586 def _real_extract(self, url):
2587 mobj = re.match(self._VALID_URL, url)
2588 if mobj is None:
2589 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2590 return
2591 video_id = mobj.group(1).decode('utf-8')
2592
2593 self.report_webpage(video_id)
2594
2595 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2596 try:
2597 webpage = urllib2.urlopen(request).read()
2598 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2599 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2600 return
2601
2602 self.report_extraction(video_id)
2603
2604
2605 # Extract video URL
2606 mobj = re.search(r'flv_url=(.+?)&', webpage)
2607 if mobj is None:
2608 self._downloader.trouble(u'ERROR: unable to extract video url')
2609 return
2610 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2611
2612
2613 # Extract title
2614 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2615 if mobj is None:
2616 self._downloader.trouble(u'ERROR: unable to extract video title')
2617 return
2618 video_title = mobj.group(1).decode('utf-8')
2619
2620
2621 # Extract video thumbnail
2622 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2623 if mobj is None:
2624 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2625 return
2626 video_thumbnail = mobj.group(0).decode('utf-8')
2627
2628 info = {
2629 'id': video_id,
2630 'url': video_url,
2631 'uploader': None,
2632 'upload_date': None,
2633 'title': video_title,
2634 'ext': 'flv',
2635 'thumbnail': video_thumbnail,
2636 'description': None,
2637 'player_url': None,
2638 }
2639
2640 return [info]
2641
2642
2643 class SoundcloudIE(InfoExtractor):
2644 """Information extractor for soundcloud.com
2645 To access the media, the uid of the song and a stream token
2646 must be extracted from the page source and the script must make
2647 a request to media.soundcloud.com/crossdomain.xml. Then
2648 the media can be grabbed by requesting from an url composed
2649 of the stream token and uid
2650 """
2651
2652 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2653 IE_NAME = u'soundcloud'
2654
2655 def __init__(self, downloader=None):
2656 InfoExtractor.__init__(self, downloader)
2657
2658 def report_webpage(self, video_id):
2659 """Report information extraction."""
2660 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2661
2662 def report_extraction(self, video_id):
2663 """Report information extraction."""
2664 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2665
2666 def _real_extract(self, url):
2667 mobj = re.match(self._VALID_URL, url)
2668 if mobj is None:
2669 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2670 return
2671
2672 # extract uploader (which is in the url)
2673 uploader = mobj.group(1).decode('utf-8')
2674 # extract simple title (uploader + slug of song title)
2675 slug_title = mobj.group(2).decode('utf-8')
2676 simple_title = uploader + u'-' + slug_title
2677
2678 self.report_webpage('%s/%s' % (uploader, slug_title))
2679
2680 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2681 try:
2682 webpage = urllib2.urlopen(request).read()
2683 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2684 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2685 return
2686
2687 self.report_extraction('%s/%s' % (uploader, slug_title))
2688
2689 # extract uid and stream token that soundcloud hands out for access
2690 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2691 if mobj:
2692 video_id = mobj.group(1)
2693 stream_token = mobj.group(2)
2694
2695 # extract unsimplified title
2696 mobj = re.search('"title":"(.*?)",', webpage)
2697 if mobj:
2698 title = mobj.group(1).decode('utf-8')
2699 else:
2700 title = simple_title
2701
2702 # construct media url (with uid/token)
2703 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2704 mediaURL = mediaURL % (video_id, stream_token)
2705
2706 # description
2707 description = u'No description available'
2708 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2709 if mobj:
2710 description = mobj.group(1)
2711
2712 # upload date
2713 upload_date = None
2714 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2715 if mobj:
2716 try:
2717 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2718 except Exception, e:
2719 self._downloader.to_stderr(compat_str(e))
2720
2721 # for soundcloud, a request to a cross domain is required for cookies
2722 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2723
2724 return [{
2725 'id': video_id.decode('utf-8'),
2726 'url': mediaURL,
2727 'uploader': uploader.decode('utf-8'),
2728 'upload_date': upload_date,
2729 'title': title,
2730 'ext': u'mp3',
2731 'player_url': None,
2732 'description': description.decode('utf-8')
2733 }]
2734
2735
2736 class InfoQIE(InfoExtractor):
2737 """Information extractor for infoq.com"""
2738
2739 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2740 IE_NAME = u'infoq'
2741
2742 def report_webpage(self, video_id):
2743 """Report information extraction."""
2744 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2745
2746 def report_extraction(self, video_id):
2747 """Report information extraction."""
2748 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2749
2750 def _real_extract(self, url):
2751 mobj = re.match(self._VALID_URL, url)
2752 if mobj is None:
2753 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2754 return
2755
2756 self.report_webpage(url)
2757
2758 request = urllib2.Request(url)
2759 try:
2760 webpage = urllib2.urlopen(request).read()
2761 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2762 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2763 return
2764
2765 self.report_extraction(url)
2766
2767
2768 # Extract video URL
2769 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2770 if mobj is None:
2771 self._downloader.trouble(u'ERROR: unable to extract video url')
2772 return
2773 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2774
2775
2776 # Extract title
2777 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2778 if mobj is None:
2779 self._downloader.trouble(u'ERROR: unable to extract video title')
2780 return
2781 video_title = mobj.group(1).decode('utf-8')
2782
2783 # Extract description
2784 video_description = u'No description available.'
2785 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2786 if mobj is not None:
2787 video_description = mobj.group(1).decode('utf-8')
2788
2789 video_filename = video_url.split('/')[-1]
2790 video_id, extension = video_filename.split('.')
2791
2792 info = {
2793 'id': video_id,
2794 'url': video_url,
2795 'uploader': None,
2796 'upload_date': None,
2797 'title': video_title,
2798 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2799 'thumbnail': None,
2800 'description': video_description,
2801 'player_url': None,
2802 }
2803
2804 return [info]
2805
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com

    Fetches the cloudcast JSON from the mixcloud API and picks a
    download URL by format and bitrate from its 'audio_formats' section.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData[fmt] is either a dict mapping bitrate -> url list, or a
        plain url list when no bitrate info exists (the TypeError path).
        Returns the url list for the chosen bitrate.
        """
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            # jsonData[fmt] was indexed with a non-key type: it is already
            # the url list itself.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # A successful open is enough to consider the url live.
                urllib2.urlopen(url)
                return url
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                url = None

        # No url in the list responded.
        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate pair for --list-formats."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    # formats[fmt] is a bare url list; print once and stop.
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Resolve a mixcloud page url to a playable file url via the API."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = urllib2.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # try every format until one of its urls responds
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
2918
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom

    Handles three url shapes: a specific video page, a course page
    (expanded recursively into its videos), and the root page (expanded
    recursively into its courses).
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract a single video, or recursively expand a course/root page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
            }

            self.report_extraction(info['id'])
            # Metadata lives in a per-video XML file next to the media.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            # Extension is taken from the media filename in the XML.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect the video page links in document order, de-duplicated.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse into each video page through the normal entry point.
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
                return

            info['title'] = info['id']

            # Collect the course page links in document order, de-duplicated.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse into each course page; each expands to its videos.
                results += self.extract(entry['url'])
            return results
3029
3030 class MTVIE(InfoExtractor):
3031 """Information extractor for MTV.com"""
3032
3033 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3034 IE_NAME = u'mtv'
3035
3036 def report_webpage(self, video_id):
3037 """Report information extraction."""
3038 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3039
3040 def report_extraction(self, video_id):
3041 """Report information extraction."""
3042 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3043
3044 def _real_extract(self, url):
3045 mobj = re.match(self._VALID_URL, url)
3046 if mobj is None:
3047 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3048 return
3049 if not mobj.group('proto'):
3050 url = 'http://' + url
3051 video_id = mobj.group('videoid')
3052 self.report_webpage(video_id)
3053
3054 request = urllib2.Request(url)
3055 try:
3056 webpage = urllib2.urlopen(request).read()
3057 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3058 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
3059 return
3060
3061 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3062 if mobj is None:
3063 self._downloader.trouble(u'ERROR: unable to extract song name')
3064 return
3065 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3066 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3067 if mobj is None:
3068 self._downloader.trouble(u'ERROR: unable to extract performer')
3069 return
3070 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3071 video_title = performer + ' - ' + song_name
3072
3073 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3074 if mobj is None:
3075 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3076 return
3077 mtvn_uri = mobj.group(1)
3078
3079 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3080 if mobj is None:
3081 self._downloader.trouble(u'ERROR: unable to extract content id')
3082 return
3083 content_id = mobj.group(1)
3084
3085 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3086 self.report_extraction(video_id)
3087 request = urllib2.Request(videogen_url)
3088 try:
3089 metadataXml = urllib2.urlopen(request).read()
3090 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3091 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3092 return
3093
3094 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3095 renditions = mdoc.findall('.//rendition')
3096
3097 # For now, always pick the highest quality.
3098 rendition = renditions[-1]
3099
3100 try:
3101 _,_,ext = rendition.attrib['type'].partition('/')
3102 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3103 video_url = rendition.find('./src').text
3104 except KeyError:
3105 self._downloader.trouble('Invalid rendition field.')
3106 return
3107
3108 info = {
3109 'id': video_id,
3110 'url': video_url,
3111 'uploader': performer,
3112 'title': video_title,
3113 'ext': ext,
3114 'format': format,
3115 }
3116
3117 return [info]
3118
3119
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku splits a video into segments; the JSON playlist provides an
    obfuscated file id and per-segment keys used to build segment urls.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Generate a pseudo-random session id (timestamp + two randoms)."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the character permutation table from the playlist seed.

        Implements youku's deterministic shuffle of the source alphabet;
        the seed drives a simple linear congruential sequence.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        return mixed

    def _get_file_id(self, fileId, seed):
        """De-obfuscate the '*'-separated file id using the seed's mix table."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Return one info dict per video segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = urllib2.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            seg_number = len(config['data'][0]['segs'][format])

            keys = [config['data'][0]['segs'][format][i]['k'] for i in xrange(seg_number)]

        #TODO check error
        #youku only could be viewed from mainland china
        except Exception:
            # BUG FIX: was a bare 'except:', which also swallowed
            # KeyboardInterrupt and SystemExit.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3240
3241 class XNXXIE(InfoExtractor):
3242 """Information extractor for xnxx.com"""
3243
3244 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3245 IE_NAME = u'xnxx'
3246 VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3247 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3248 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3249
3250 def report_webpage(self, video_id):
3251 """Report information extraction"""
3252 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3253
3254 def report_extraction(self, video_id):
3255 """Report information extraction"""
3256 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3257
3258 def _real_extract(self, url):
3259 mobj = re.match(self._VALID_URL, url)
3260 if mobj is None:
3261 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3262 return
3263 video_id = mobj.group(1).decode('utf-8')
3264
3265 self.report_webpage(video_id)
3266
3267 # Get webpage content
3268 try:
3269 webpage = urllib2.urlopen(url).read()
3270 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3271 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3272 return
3273
3274 result = re.search(self.VIDEO_URL_RE, webpage)
3275 if result is None:
3276 self._downloader.trouble(u'ERROR: unable to extract video url')
3277 return
3278 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3279
3280 result = re.search(self.VIDEO_TITLE_RE, webpage)
3281 if result is None:
3282 self._downloader.trouble(u'ERROR: unable to extract video title')
3283 return
3284 video_title = result.group(1).decode('utf-8')
3285
3286 result = re.search(self.VIDEO_THUMB_RE, webpage)
3287 if result is None:
3288 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3289 return
3290 video_thumbnail = result.group(1).decode('utf-8')
3291
3292 info = {'id': video_id,
3293 'url': video_url,
3294 'uploader': None,
3295 'upload_date': None,
3296 'title': video_title,
3297 'ext': 'flv',
3298 'thumbnail': video_thumbnail,
3299 'description': None,
3300 'player_url': None}
3301
3302 return [info]
3303
3304
3305 class GooglePlusIE(InfoExtractor):
3306 """Information extractor for plus.google.com."""
3307
3308 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3309 IE_NAME = u'plus.google'
3310
3311 def __init__(self, downloader=None):
3312 InfoExtractor.__init__(self, downloader)
3313
3314 def report_extract_entry(self, url):
3315 """Report downloading extry"""
3316 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3317
3318 def report_date(self, upload_date):
3319 """Report downloading extry"""
3320 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3321
3322 def report_uploader(self, uploader):
3323 """Report downloading extry"""
3324 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3325
3326 def report_title(self, video_title):
3327 """Report downloading extry"""
3328 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3329
3330 def report_extract_vid_page(self, video_page):
3331 """Report information extraction."""
3332 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3333
3334 def _real_extract(self, url):
3335 # Extract id from URL
3336 mobj = re.match(self._VALID_URL, url)
3337 if mobj is None:
3338 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3339 return
3340
3341 post_url = mobj.group(0)
3342 video_id = mobj.group(2)
3343
3344 video_extension = 'flv'
3345
3346 # Step 1, Retrieve post webpage to extract further information
3347 self.report_extract_entry(post_url)
3348 request = urllib2.Request(post_url)
3349 try:
3350 webpage = urllib2.urlopen(request).read()
3351 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3352 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3353 return
3354
3355 # Extract update date
3356 upload_date = u'NA'
3357 pattern = 'title="Timestamp">(.*?)</a>'
3358 mobj = re.search(pattern, webpage)
3359 if mobj:
3360 upload_date = mobj.group(1)
3361 # Convert timestring to a format suitable for filename
3362 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3363 upload_date = upload_date.strftime('%Y%m%d')
3364 self.report_date(upload_date)
3365
3366 # Extract uploader
3367 uploader = u'NA'
3368 pattern = r'rel\="author".*?>(.*?)</a>'
3369 mobj = re.search(pattern, webpage)
3370 if mobj:
3371 uploader = mobj.group(1)
3372 self.report_uploader(uploader)
3373
3374 # Extract title
3375 # Get the first line for title
3376 video_title = u'NA'
3377 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3378 mobj = re.search(pattern, webpage)
3379 if mobj:
3380 video_title = mobj.group(1)
3381 self.report_title(video_title)
3382
3383 # Step 2, Stimulate clicking the image box to launch video
3384 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3385 mobj = re.search(pattern, webpage)
3386 if mobj is None:
3387 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3388
3389 video_page = mobj.group(1)
3390 request = urllib2.Request(video_page)
3391 try:
3392 webpage = urllib2.urlopen(request).read()
3393 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3394 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3395 return
3396 self.report_extract_vid_page(video_page)
3397
3398
3399 # Extract video links on video page
3400 """Extract video links of all sizes"""
3401 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3402 mobj = re.findall(pattern, webpage)
3403 if len(mobj) == 0:
3404 self._downloader.trouble(u'ERROR: unable to extract video links')
3405
3406 # Sort in resolution
3407 links = sorted(mobj)
3408
3409 # Choose the lowest of the sort, i.e. highest resolution
3410 video_url = links[-1]
3411 # Only get the url. The resolution part in the tuple has no use anymore
3412 video_url = video_url[-1]
3413 # Treat escaped \u0026 style hex
3414 video_url = unicode(video_url, "unicode_escape")
3415
3416
3417 return [{
3418 'id': video_id.decode('utf-8'),
3419 'url': video_url,
3420 'uploader': uploader.decode('utf-8'),
3421 'upload_date': upload_date.decode('utf-8'),
3422 'title': video_title.decode('utf-8'),
3423 'ext': video_extension.decode('utf-8'),
3424 'player_url': None,
3425 }]