# Scraped from the jfr.im git mirror of yt-dlp.git, file "youtube-dl".
# Commit subject: "Request page compression by default, like Firefox does"
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # License: Public domain code
8 import cookielib
9 import datetime
10 import htmlentitydefs
11 import httplib
12 import locale
13 import math
14 import netrc
15 import os
16 import os.path
17 import re
18 import socket
19 import string
20 import subprocess
21 import sys
22 import time
23 import urllib
24 import urllib2
25
26 # parse_qs was moved from the cgi module to the urlparse module recently.
27 try:
28 from urlparse import parse_qs
29 except ImportError:
30 from cgi import parse_qs
31
# HTTP headers sent with every request, mimicking a contemporary Firefox
# so that sites serve the same content they would to a real browser.
# Accept-Encoding asks for gzip/deflate page compression by default.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}

# Characters kept verbatim in "simplified" titles: ASCII letters and digits.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
41
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # The original routed this through a one-shot generator
    # (yield_preferredencoding().next()) for no benefit; a plain
    # try/except computes the same value.
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported codec actually exists and can encode text
        # before trusting it.
        u'TEST'.encode(pref)
    except Exception:
        # Unknown or broken locale: fall back to a sane default.
        pref = 'UTF-8'
    return pref
57
58 def htmlentity_transform(matchobj):
59 """Transforms an HTML entity to a Unicode character.
60
61 This function receives a match object and is intended to be used with
62 the re.sub() function.
63 """
64 entity = matchobj.group(1)
65
66 # Known non-numeric HTML entity
67 if entity in htmlentitydefs.name2codepoint:
68 return unichr(htmlentitydefs.name2codepoint[entity])
69
70 # Unicode character
71 mobj = re.match(ur'(?u)#(x?\d+)', entity)
72 if mobj is not None:
73 numstr = mobj.group(1)
74 if numstr.startswith(u'x'):
75 base = 16
76 numstr = u'0%s' % numstr
77 else:
78 base = 10
79 return unichr(long(numstr, base))
80
81 # Unknown entity in name, return its literal representation
82 return (u'&%s;' % entity)
83
def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename."""
    # Decode HTML entities first, then replace the path separator so the
    # title cannot introduce extra directory components into the filename.
    utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
    return utitle.replace(unicode(os.sep), u'%')
88
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                # Put stdout into binary mode so video bytes are not
                # mangled by Windows newline translation.
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(filename, open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
114
# Raised from FileDownloader.trouble() when 'ignoreerrors' is not set.
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
123
# Raised by FileDownloader.download() when several URLs would be written
# to a single fixed (substitution-free) output template.
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
131
# Caught by FileDownloader.process_info() around the post_process() call.
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
139
# Raised by FileDownloader.process_info() when downloading the video data
# itself fails with an OS/IO error.
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
147
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        """Store the actual and announced sizes for the error message."""
        self.downloaded = downloaded
        self.expected = expected
162
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:         Username for authentication purposes.
    password:         Password for authentication purposes.
    usenetrc:         Use netrc for authentication instead.
    quiet:            Do not print messages to stdout.
    forceurl:         Force printing final URL.
    forcetitle:       Force printing title.
    forcethumbnail:   Force printing thumbnail URL.
    forcedescription: Force printing description.
    simulate:         Do not download the video files.
    format:           Video format code.
    format_limit:     Highest quality format to try.
    outtmpl:          Template for output names.
    ignoreerrors:     Do not stop on download errors.
    ratelimit:        Download speed limit, in bytes/sec.
    nooverwrites:     Prevent overwriting files.
    retries:          Number of times to retry for HTTP error 5xx
    continuedl:       Try to continue downloads if possible.
    noprogress:       Do not print the progress bar.
    playliststart:    Playlist item to start at.
    playlistend:      Playlist item to end at.
    logtostderr:      Log messages to stderr instead of stdout.
    """

    # Class-level defaults; all are re-bound per instance in __init__().
    params = None               # Options dictionary (see docstring above).
    _ies = []                   # Registered InfoExtractors, in order.
    _pps = []                   # Registered PostProcessors, in order.
    _download_retcode = None    # Exit code returned by download().
    _num_downloads = None       # Ordinal feeding %(autonumber)s.
    _screen_file = None         # Stream used by to_screen() (stdout/stderr).
219
220 def __init__(self, params):
221 """Create a FileDownloader object with the given options."""
222 self._ies = []
223 self._pps = []
224 self._download_retcode = 0
225 self._num_downloads = 0
226 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
227 self.params = params
228
229 @staticmethod
230 def pmkdir(filename):
231 """Create directory components in filename. Similar to Unix "mkdir -p"."""
232 components = filename.split(os.sep)
233 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
234 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
235 for dir in aggregate:
236 if not os.path.exists(dir):
237 os.mkdir(dir)
238
239 @staticmethod
240 def temp_name(filename):
241 """Returns a temporary filename for the given filename."""
242 if filename == u'-' or (os.path.exists(filename) and not os.path.isfile(filename)):
243 return filename
244 return filename + u'.part'
245
246 @staticmethod
247 def format_bytes(bytes):
248 if bytes is None:
249 return 'N/A'
250 if type(bytes) is str:
251 bytes = float(bytes)
252 if bytes == 0.0:
253 exponent = 0
254 else:
255 exponent = long(math.log(bytes, 1024.0))
256 suffix = 'bkMGTPEZY'[exponent]
257 converted = float(bytes) / float(1024**exponent)
258 return '%.2f%s' % (converted, suffix)
259
260 @staticmethod
261 def calc_percent(byte_counter, data_len):
262 if data_len is None:
263 return '---.-%'
264 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
265
    @staticmethod
    def calc_eta(start, now, total, current):
        """Estimate remaining download time, formatted as 'MM:SS'.

        Returns '--:--' when the total is unknown, nothing has been
        transferred yet, or the estimate exceeds 99 minutes.
        """
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001: # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)
279
280 @staticmethod
281 def calc_speed(start, now, bytes):
282 dif = now - start
283 if bytes == 0 or dif < 0.001: # One millisecond
284 return '%10s' % '---b/s'
285 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
286
    @staticmethod
    def best_block_size(elapsed_time, bytes):
        """Choose the next read size based on the last block's throughput.

        The measured rate (bytes/second) becomes the next block size,
        clamped between half and double the previous block and capped
        at an absolute 4 MB.
        """
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return long(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return long(new_max)
        if rate < new_min:
            return long(new_min)
        return long(rate)
299
    @staticmethod
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer.

        Accepts an optional single-letter binary suffix (k, M, G, ...);
        returns None if the string does not parse.
        """
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        if matchobj is None:
            return None
        number = float(matchobj.group(1))
        # An absent suffix yields '', and str.index('') is 0, giving a
        # multiplier of 1024**0 == 1.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))
309
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        # Mutual registration: the IE reports back through this downloader.
        ie.set_downloader(self)
314
    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        # Mutual registration, mirroring add_info_extractor().
        pp.set_downloader(self)
319
    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to the screen file if not in quiet mode.

        The screen file is stdout, or stderr when the 'logtostderr'
        option is set (see __init__). With skip_eol, no newline is
        appended, allowing '\\r'-style progress updates.
        """
        try:
            if not self.params.get('quiet', False):
                terminator = [u'\n', u''][skip_eol]
                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                self._screen_file.flush()
        except (UnicodeEncodeError), err:
            if not ignore_encoding_errors:
                raise
330
    def to_stderr(self, message):
        """Print message to stderr, encoded for the current locale."""
        print >>sys.stderr, message.encode(preferredencoding())
334
    def fixed_template(self):
        """Checks if the output template is fixed.

        Returns True when 'outtmpl' contains no '%(field)s' substitution,
        i.e. every download would be written to the same file name.
        """
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
338
    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # With 'ignoreerrors' set: record the failure in the eventual exit
        # code and carry on.
        self._download_retcode = 1
351
352 def slow_down(self, start_time, byte_counter):
353 """Sleep if the download speed is over the rate limit."""
354 rate_limit = self.params.get('ratelimit', None)
355 if rate_limit is None or byte_counter == 0:
356 return
357 now = time.time()
358 elapsed = now - start_time
359 if elapsed <= 0.0:
360 return
361 speed = float(byte_counter) / elapsed
362 if speed > rate_limit:
363 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
364
    def try_rename(self, old_filename, new_filename):
        """Rename old_filename to new_filename, reporting failure via trouble()."""
        try:
            if old_filename == new_filename:
                # Nothing to do (e.g. temp_name() returned the name unchanged).
                return
            os.rename(old_filename, new_filename)
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')
372
    def report_destination(self, filename):
        """Report destination filename."""
        # Encoding errors are ignored: this report is purely informational.
        self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
376
    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress, unless 'noprogress' is set."""
        if self.params.get('noprogress', False):
            return
        # Leading \r plus skip_eol rewrites the current terminal line in place.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
383
    def report_resuming_byte(self, resume_len):
        """Report attempt to resume the download at the given byte offset."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
387
    def report_retry(self, count, retries):
        """Report retry (attempt count of retries) after an HTTP error 5xx."""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
391
    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # Fall back to a message without the (unencodable) file name.
            self.to_screen(u'[download] The file has already been downloaded')
398
    def report_unable_to_resume(self):
        """Report it was impossible to resume download; restarting instead."""
        self.to_screen(u'[download] Unable to resume')
402
    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        else:
            # With the progress bar active, just terminate its line.
            self.to_screen(u'')
409
    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        # Feeds the '%(autonumber)s' field used by process_info().
        self._num_downloads += 1
413
    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor.

        In simulate mode only the forced printings run; otherwise the
        output filename is built from the template, directories created,
        the data downloaded, and postprocessors applied.
        """
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Forced printings
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

            return

        try:
            template_dict = dict(info_dict)
            # Extra template fields: Unix timestamp and a per-run ordinal.
            template_dict['epoch'] = unicode(long(time.time()))
            template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
            filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            return
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')
            return

        try:
            self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directories: %s' % str(err))
            return

        try:
            success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:
            # OS-level failure while writing the video: signal the caller
            # that this format could not be saved.
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
            return
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
            return

        if success:
            try:
                self.post_process(filename, info_dict)
            except (PostProcessingError), err:
                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                return
465
466 def download(self, url_list):
467 """Download a given list of URLs."""
468 if len(url_list) > 1 and self.fixed_template():
469 raise SameFileError(self.params['outtmpl'])
470
471 for url in url_list:
472 suitable_found = False
473 for ie in self._ies:
474 # Go to next InfoExtractor if not suitable
475 if not ie.suitable(url):
476 continue
477
478 # Suitable InfoExtractor found
479 suitable_found = True
480
481 # Extract information from URL and process it
482 ie.extract(url)
483
484 # Suitable InfoExtractor had been found; go to next URL
485 break
486
487 if not suitable_found:
488 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
489
490 return self._download_retcode
491
    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file.

        Each postprocessor receives the info dict (augmented with
        'filepath') and returns the dict for the next one, or None to
        abort the rest of the chain.
        """
        info = dict(ie_info)
        info['filepath'] = filename
        for pp in self._pps:
            info = pp.run(info)
            if info is None:
                break
500
    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an RTMP stream by driving an external rtmpdump process.

        Returns True on success, False after a reported error.
        """
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrupted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            if prevsize == cursize and retval == 1:
                # rtmpdump failed again without fetching any new bytes:
                # give up instead of looping forever.
                break
        if retval == 0:
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
            self.try_rename(tmpfilename, filename)
            return True
        else:
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            return False
532
533 def _do_download(self, filename, url, player_url):
534 # Check file already present
535 if self.params.get('continuedl', False) and os.path.isfile(filename):
536 self.report_file_already_downloaded(filename)
537 return True
538
539 # Attempt to download using rtmpdump
540 if url.startswith('rtmp'):
541 return self._download_with_rtmpdump(filename, url, player_url)
542
543 tmpfilename = self.temp_name(filename)
544 stream = None
545 open_mode = 'wb'
546 basic_request = urllib2.Request(url, None, std_headers)
547 request = urllib2.Request(url, None, std_headers)
548
549 # Establish possible resume length
550 if os.path.isfile(tmpfilename):
551 resume_len = os.path.getsize(tmpfilename)
552 else:
553 resume_len = 0
554
555 # Request parameters in case of being able to resume
556 if self.params.get('continuedl', False) and resume_len != 0:
557 self.report_resuming_byte(resume_len)
558 request.add_header('Range','bytes=%d-' % resume_len)
559 open_mode = 'ab'
560
561 count = 0
562 retries = self.params.get('retries', 0)
563 while count <= retries:
564 # Establish connection
565 try:
566 data = urllib2.urlopen(request)
567 break
568 except (urllib2.HTTPError, ), err:
569 if (err.code < 500 or err.code >= 600) and err.code != 416:
570 # Unexpected HTTP error
571 raise
572 elif err.code == 416:
573 # Unable to resume (requested range not satisfiable)
574 try:
575 # Open the connection again without the range header
576 data = urllib2.urlopen(basic_request)
577 content_length = data.info()['Content-Length']
578 except (urllib2.HTTPError, ), err:
579 if err.code < 500 or err.code >= 600:
580 raise
581 else:
582 # Examine the reported length
583 if (content_length is not None and
584 (resume_len - 100 < long(content_length) < resume_len + 100)):
585 # The file had already been fully downloaded.
586 # Explanation to the above condition: in issue #175 it was revealed that
587 # YouTube sometimes adds or removes a few bytes from the end of the file,
588 # changing the file size slightly and causing problems for some users. So
589 # I decided to implement a suggested change and consider the file
590 # completely downloaded if the file size differs less than 100 bytes from
591 # the one in the hard drive.
592 self.report_file_already_downloaded(filename)
593 self.try_rename(tmpfilename, filename)
594 return True
595 else:
596 # The length does not match, we start the download over
597 self.report_unable_to_resume()
598 open_mode = 'wb'
599 break
600 # Retry
601 count += 1
602 if count <= retries:
603 self.report_retry(count, retries)
604
605 if count > retries:
606 self.trouble(u'ERROR: giving up after %s retries' % retries)
607 return False
608
609 data_len = data.info().get('Content-length', None)
610 if data_len is not None:
611 data_len = long(data_len) + resume_len
612 data_len_str = self.format_bytes(data_len)
613 byte_counter = 0 + resume_len
614 block_size = 1024
615 start = time.time()
616 while True:
617 # Download and write
618 before = time.time()
619 data_block = data.read(block_size)
620 after = time.time()
621 if len(data_block) == 0:
622 break
623 byte_counter += len(data_block)
624
625 # Open file just in time
626 if stream is None:
627 try:
628 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
629 self.report_destination(filename)
630 except (OSError, IOError), err:
631 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
632 return False
633 try:
634 stream.write(data_block)
635 except (IOError, OSError), err:
636 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
637 return False
638 block_size = self.best_block_size(after - before, len(data_block))
639
640 # Progress message
641 percent_str = self.calc_percent(byte_counter, data_len)
642 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
643 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
644 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
645
646 # Apply rate limit
647 self.slow_down(start, byte_counter - resume_len)
648
649 stream.close()
650 self.report_finish()
651 if data_len is not None and byte_counter != data_len:
652 raise ContentTooShortError(byte_counter, long(data_len))
653 self.try_rename(tmpfilename, filename)
654 return True
655
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:          Video identifier.
    url:         Final video URL.
    uploader:    Nickname of the video uploader.
    title:       Literal title.
    stitle:      Simplified title.
    ext:         Video filename extension.
    format:      Video format.
    player_url:  SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    _ready = False        # True once _real_initialize() has run.
    _downloader = None    # FileDownloader set via set_downloader().

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Receives a URL and returns True if suitable for this IE."""
        return False

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Lazy: the real initialization runs at most once, on first use.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
726
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # group(2) of a match against _VALID_URL is the video id; group(1),
    # when present, is the scheme/host/path prefix before it.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
    # Visited once to switch the site language to English.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name looked up in the user's ~/.netrc for credentials.
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
    # File extension used for each known format code.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '45': 'webm',
    }
747
    @staticmethod
    def suitable(url):
        """Return True if this IE can handle the given URL."""
        return (re.match(YoutubeIE._VALID_URL, url) is not None)
751
    def report_lang(self):
        """Report attempt to set language (English) on youtube.com."""
        self._downloader.to_screen(u'[youtube] Setting language')
755
    def report_login(self):
        """Report attempt to log in to YouTube."""
        self._downloader.to_screen(u'[youtube] Logging in')
759
    def report_age_confirmation(self):
        """Report attempt to confirm age (for age-restricted videos)."""
        self._downloader.to_screen(u'[youtube] Confirming age')
763
    def report_video_webpage_download(self, video_id):
        """Report attempt to download the watch page for video_id."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
767
    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download the get_video_info page for video_id."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
771
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
775
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is unavailable for video_id."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
779
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')
783
    def _real_initialize(self):
        """Set the site language to English and optionally log in.

        Credentials come from the 'username'/'password' options or, with
        'usenetrc', from the user's .netrc. All failures here are
        non-fatal warnings except age confirmation.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language
        request = urllib2.Request(self._LANG_URL, None, std_headers)
        try:
            self.report_lang()
            # Response body is discarded; reaching the URL sets the cookie.
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next': '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the login form is still in the response, the login failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
                'next_url': '/',
                'action_confirm': 'Confirm',
                }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        try:
            self.report_age_confirmation()
            # Response body is not inspected; submitting the form is enough.
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return
852
853 def _real_extract(self, url):
854 # Extract video id from URL
855 mobj = re.match(self._VALID_URL, url)
856 if mobj is None:
857 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
858 return
859 video_id = mobj.group(2)
860
861 # Get video webpage
862 self.report_video_webpage_download(video_id)
863 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id, None, std_headers)
864 try:
865 video_webpage = urllib2.urlopen(request).read()
866 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
867 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
868 return
869
870 # Attempt to extract SWF player URL
871 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
872 if mobj is not None:
873 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
874 else:
875 player_url = None
876
877 # Get video info
878 self.report_video_info_webpage_download(video_id)
879 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
880 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
881 % (video_id, el_type))
882 request = urllib2.Request(video_info_url, None, std_headers)
883 try:
884 video_info_webpage = urllib2.urlopen(request).read()
885 video_info = parse_qs(video_info_webpage)
886 if 'token' in video_info:
887 break
888 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
889 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
890 return
891 if 'token' not in video_info:
892 if 'reason' in video_info:
893 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
894 else:
895 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
896 return
897
898 # Start extracting information
899 self.report_information_extraction(video_id)
900
901 # uploader
902 if 'author' not in video_info:
903 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
904 return
905 video_uploader = urllib.unquote_plus(video_info['author'][0])
906
907 # title
908 if 'title' not in video_info:
909 self._downloader.trouble(u'ERROR: unable to extract video title')
910 return
911 video_title = urllib.unquote_plus(video_info['title'][0])
912 video_title = video_title.decode('utf-8')
913 video_title = sanitize_title(video_title)
914
915 # simplified title
916 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
917 simple_title = simple_title.strip(ur'_')
918
919 # thumbnail image
920 if 'thumbnail_url' not in video_info:
921 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
922 video_thumbnail = ''
923 else: # don't panic if we can't find it
924 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
925
926 # upload date
927 upload_date = u'NA'
928 mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
929 if mobj is not None:
930 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
931 format_expressions = ['%d %B %Y', '%B %d %Y']
932 for expression in format_expressions:
933 try:
934 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
935 except:
936 pass
937
938 # description
939 video_description = 'No description available.'
940 if self._downloader.params.get('forcedescription', False):
941 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
942 if mobj is not None:
943 video_description = mobj.group(1)
944
945 # token
946 video_token = urllib.unquote_plus(video_info['token'][0])
947
948 # Decide which formats to download
949 req_format = self._downloader.params.get('format', None)
950
951 if 'fmt_url_map' in video_info:
952 url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
953 format_limit = self._downloader.params.get('format_limit', None)
954 if format_limit is not None and format_limit in self._available_formats:
955 format_list = self._available_formats[self._available_formats.index(format_limit):]
956 else:
957 format_list = self._available_formats
958 existing_formats = [x for x in format_list if x in url_map]
959 if len(existing_formats) == 0:
960 self._downloader.trouble(u'ERROR: no known formats available for video')
961 return
962 if req_format is None:
963 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
964 elif req_format == '-1':
965 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
966 else:
967 # Specific format
968 if req_format not in url_map:
969 self._downloader.trouble(u'ERROR: requested format not available')
970 return
971 video_url_list = [(req_format, url_map[req_format])] # Specific format
972
973 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
974 self.report_rtmp_download()
975 video_url_list = [(None, video_info['conn'][0])]
976
977 else:
978 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
979 return
980
981 for format_param, video_real_url in video_url_list:
982 # At this point we have a new video
983 self._downloader.increment_downloads()
984
985 # Extension
986 video_extension = self._video_extensions.get(format_param, 'flv')
987
988 # Find the video URL in fmt_url_map or conn paramters
989 try:
990 # Process video information
991 self._downloader.process_info({
992 'id': video_id.decode('utf-8'),
993 'url': video_real_url.decode('utf-8'),
994 'uploader': video_uploader.decode('utf-8'),
995 'upload_date': upload_date,
996 'title': video_title,
997 'stitle': simple_title,
998 'ext': video_extension.decode('utf-8'),
999 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1000 'thumbnail': video_thumbnail.decode('utf-8'),
1001 'description': video_description.decode('utf-8'),
1002 'player_url': player_url,
1003 })
1004 except UnavailableVideoError, err:
1005 self._downloader.trouble(u'ERROR: unable to download video')
1006
1007
1008 class MetacafeIE(InfoExtractor):
1009 """Information Extractor for metacafe.com."""
1010
1011 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1012 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1013 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1014 _youtube_ie = None
1015
1016 def __init__(self, youtube_ie, downloader=None):
1017 InfoExtractor.__init__(self, downloader)
1018 self._youtube_ie = youtube_ie
1019
1020 @staticmethod
1021 def suitable(url):
1022 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1023
1024 def report_disclaimer(self):
1025 """Report disclaimer retrieval."""
1026 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1027
1028 def report_age_confirmation(self):
1029 """Report attempt to confirm age."""
1030 self._downloader.to_screen(u'[metacafe] Confirming age')
1031
1032 def report_download_webpage(self, video_id):
1033 """Report webpage download."""
1034 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1035
1036 def report_extraction(self, video_id):
1037 """Report information extraction."""
1038 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1039
1040 def _real_initialize(self):
1041 # Retrieve disclaimer
1042 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
1043 try:
1044 self.report_disclaimer()
1045 disclaimer = urllib2.urlopen(request).read()
1046 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1047 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1048 return
1049
1050 # Confirm age
1051 disclaimer_form = {
1052 'filters': '0',
1053 'submit': "Continue - I'm over 18",
1054 }
1055 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1056 try:
1057 self.report_age_confirmation()
1058 disclaimer = urllib2.urlopen(request).read()
1059 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1060 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1061 return
1062
1063 def _real_extract(self, url):
1064 # Extract id and simplified title from URL
1065 mobj = re.match(self._VALID_URL, url)
1066 if mobj is None:
1067 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1068 return
1069
1070 video_id = mobj.group(1)
1071
1072 # Check if video comes from YouTube
1073 mobj2 = re.match(r'^yt-(.*)$', video_id)
1074 if mobj2 is not None:
1075 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1076 return
1077
1078 # At this point we have a new video
1079 self._downloader.increment_downloads()
1080
1081 simple_title = mobj.group(2).decode('utf-8')
1082
1083 # Retrieve video webpage to extract further information
1084 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1085 try:
1086 self.report_download_webpage(video_id)
1087 webpage = urllib2.urlopen(request).read()
1088 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1089 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1090 return
1091
1092 # Extract URL, uploader and title from webpage
1093 self.report_extraction(video_id)
1094 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1095 if mobj is not None:
1096 mediaURL = urllib.unquote(mobj.group(1))
1097 video_extension = mediaURL[-3:]
1098
1099 # Extract gdaKey if available
1100 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1101 if mobj is None:
1102 video_url = mediaURL
1103 else:
1104 gdaKey = mobj.group(1)
1105 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1106 else:
1107 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1108 if mobj is None:
1109 self._downloader.trouble(u'ERROR: unable to extract media URL')
1110 return
1111 vardict = parse_qs(mobj.group(1))
1112 if 'mediaData' not in vardict:
1113 self._downloader.trouble(u'ERROR: unable to extract media URL')
1114 return
1115 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1116 if mobj is None:
1117 self._downloader.trouble(u'ERROR: unable to extract media URL')
1118 return
1119 mediaURL = mobj.group(1).replace('\\/', '/')
1120 video_extension = mediaURL[-3:]
1121 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1122
1123 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1124 if mobj is None:
1125 self._downloader.trouble(u'ERROR: unable to extract title')
1126 return
1127 video_title = mobj.group(1).decode('utf-8')
1128 video_title = sanitize_title(video_title)
1129
1130 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1131 if mobj is None:
1132 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1133 return
1134 video_uploader = mobj.group(1)
1135
1136 try:
1137 # Process video information
1138 self._downloader.process_info({
1139 'id': video_id.decode('utf-8'),
1140 'url': video_url.decode('utf-8'),
1141 'uploader': video_uploader.decode('utf-8'),
1142 'upload_date': u'NA',
1143 'title': video_title,
1144 'stitle': simple_title,
1145 'ext': video_extension.decode('utf-8'),
1146 'format': u'NA',
1147 'player_url': None,
1148 })
1149 except UnavailableVideoError:
1150 self._downloader.trouble(u'ERROR: unable to download video')
1151
1152
1153 class DailymotionIE(InfoExtractor):
1154 """Information Extractor for Dailymotion"""
1155
1156 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1157
1158 def __init__(self, downloader=None):
1159 InfoExtractor.__init__(self, downloader)
1160
1161 @staticmethod
1162 def suitable(url):
1163 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1164
1165 def report_download_webpage(self, video_id):
1166 """Report webpage download."""
1167 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1168
1169 def report_extraction(self, video_id):
1170 """Report information extraction."""
1171 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1172
1173 def _real_initialize(self):
1174 return
1175
1176 def _real_extract(self, url):
1177 # Extract id and simplified title from URL
1178 mobj = re.match(self._VALID_URL, url)
1179 if mobj is None:
1180 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1181 return
1182
1183 # At this point we have a new video
1184 self._downloader.increment_downloads()
1185 video_id = mobj.group(1)
1186
1187 simple_title = mobj.group(2).decode('utf-8')
1188 video_extension = 'flv'
1189
1190 # Retrieve video webpage to extract further information
1191 request = urllib2.Request(url)
1192 try:
1193 self.report_download_webpage(video_id)
1194 webpage = urllib2.urlopen(request).read()
1195 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1196 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1197 return
1198
1199 # Extract URL, uploader and title from webpage
1200 self.report_extraction(video_id)
1201 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1202 if mobj is None:
1203 self._downloader.trouble(u'ERROR: unable to extract media URL')
1204 return
1205 mediaURL = urllib.unquote(mobj.group(1))
1206
1207 # if needed add http://www.dailymotion.com/ if relative URL
1208
1209 video_url = mediaURL
1210
1211 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1212 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1213 if mobj is None:
1214 self._downloader.trouble(u'ERROR: unable to extract title')
1215 return
1216 video_title = mobj.group(1).decode('utf-8')
1217 video_title = sanitize_title(video_title)
1218
1219 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
1220 if mobj is None:
1221 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1222 return
1223 video_uploader = mobj.group(1)
1224
1225 try:
1226 # Process video information
1227 self._downloader.process_info({
1228 'id': video_id.decode('utf-8'),
1229 'url': video_url.decode('utf-8'),
1230 'uploader': video_uploader.decode('utf-8'),
1231 'upload_date': u'NA',
1232 'title': video_title,
1233 'stitle': simple_title,
1234 'ext': video_extension.decode('utf-8'),
1235 'format': u'NA',
1236 'player_url': None,
1237 })
1238 except UnavailableVideoError:
1239 self._downloader.trouble(u'ERROR: unable to download video')
1240
1241 class GoogleIE(InfoExtractor):
1242 """Information extractor for video.google.com."""
1243
1244 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1245
1246 def __init__(self, downloader=None):
1247 InfoExtractor.__init__(self, downloader)
1248
1249 @staticmethod
1250 def suitable(url):
1251 return (re.match(GoogleIE._VALID_URL, url) is not None)
1252
1253 def report_download_webpage(self, video_id):
1254 """Report webpage download."""
1255 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1256
1257 def report_extraction(self, video_id):
1258 """Report information extraction."""
1259 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1260
1261 def _real_initialize(self):
1262 return
1263
1264 def _real_extract(self, url):
1265 # Extract id from URL
1266 mobj = re.match(self._VALID_URL, url)
1267 if mobj is None:
1268 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1269 return
1270
1271 # At this point we have a new video
1272 self._downloader.increment_downloads()
1273 video_id = mobj.group(1)
1274
1275 video_extension = 'mp4'
1276
1277 # Retrieve video webpage to extract further information
1278 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1279 try:
1280 self.report_download_webpage(video_id)
1281 webpage = urllib2.urlopen(request).read()
1282 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1283 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1284 return
1285
1286 # Extract URL, uploader, and title from webpage
1287 self.report_extraction(video_id)
1288 mobj = re.search(r"download_url:'([^']+)'", webpage)
1289 if mobj is None:
1290 video_extension = 'flv'
1291 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1292 if mobj is None:
1293 self._downloader.trouble(u'ERROR: unable to extract media URL')
1294 return
1295 mediaURL = urllib.unquote(mobj.group(1))
1296 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1297 mediaURL = mediaURL.replace('\\x26', '\x26')
1298
1299 video_url = mediaURL
1300
1301 mobj = re.search(r'<title>(.*)</title>', webpage)
1302 if mobj is None:
1303 self._downloader.trouble(u'ERROR: unable to extract title')
1304 return
1305 video_title = mobj.group(1).decode('utf-8')
1306 video_title = sanitize_title(video_title)
1307 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1308
1309 # Extract video description
1310 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1311 if mobj is None:
1312 self._downloader.trouble(u'ERROR: unable to extract video description')
1313 return
1314 video_description = mobj.group(1).decode('utf-8')
1315 if not video_description:
1316 video_description = 'No description available.'
1317
1318 # Extract video thumbnail
1319 if self._downloader.params.get('forcethumbnail', False):
1320 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1321 try:
1322 webpage = urllib2.urlopen(request).read()
1323 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1324 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1325 return
1326 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1327 if mobj is None:
1328 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1329 return
1330 video_thumbnail = mobj.group(1)
1331 else: # we need something to pass to process_info
1332 video_thumbnail = ''
1333
1334
1335 try:
1336 # Process video information
1337 self._downloader.process_info({
1338 'id': video_id.decode('utf-8'),
1339 'url': video_url.decode('utf-8'),
1340 'uploader': u'NA',
1341 'upload_date': u'NA',
1342 'title': video_title,
1343 'stitle': simple_title,
1344 'ext': video_extension.decode('utf-8'),
1345 'format': u'NA',
1346 'player_url': None,
1347 })
1348 except UnavailableVideoError:
1349 self._downloader.trouble(u'ERROR: unable to download video')
1350
1351
1352 class PhotobucketIE(InfoExtractor):
1353 """Information extractor for photobucket.com."""
1354
1355 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1356
1357 def __init__(self, downloader=None):
1358 InfoExtractor.__init__(self, downloader)
1359
1360 @staticmethod
1361 def suitable(url):
1362 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1363
1364 def report_download_webpage(self, video_id):
1365 """Report webpage download."""
1366 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1367
1368 def report_extraction(self, video_id):
1369 """Report information extraction."""
1370 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1371
1372 def _real_initialize(self):
1373 return
1374
1375 def _real_extract(self, url):
1376 # Extract id from URL
1377 mobj = re.match(self._VALID_URL, url)
1378 if mobj is None:
1379 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1380 return
1381
1382 # At this point we have a new video
1383 self._downloader.increment_downloads()
1384 video_id = mobj.group(1)
1385
1386 video_extension = 'flv'
1387
1388 # Retrieve video webpage to extract further information
1389 request = urllib2.Request(url)
1390 try:
1391 self.report_download_webpage(video_id)
1392 webpage = urllib2.urlopen(request).read()
1393 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1394 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1395 return
1396
1397 # Extract URL, uploader, and title from webpage
1398 self.report_extraction(video_id)
1399 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1400 if mobj is None:
1401 self._downloader.trouble(u'ERROR: unable to extract media URL')
1402 return
1403 mediaURL = urllib.unquote(mobj.group(1))
1404
1405 video_url = mediaURL
1406
1407 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1408 if mobj is None:
1409 self._downloader.trouble(u'ERROR: unable to extract title')
1410 return
1411 video_title = mobj.group(1).decode('utf-8')
1412 video_title = sanitize_title(video_title)
1413 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1414
1415 video_uploader = mobj.group(2).decode('utf-8')
1416
1417 try:
1418 # Process video information
1419 self._downloader.process_info({
1420 'id': video_id.decode('utf-8'),
1421 'url': video_url.decode('utf-8'),
1422 'uploader': video_uploader,
1423 'upload_date': u'NA',
1424 'title': video_title,
1425 'stitle': simple_title,
1426 'ext': video_extension.decode('utf-8'),
1427 'format': u'NA',
1428 'player_url': None,
1429 })
1430 except UnavailableVideoError:
1431 self._downloader.trouble(u'ERROR: unable to download video')
1432
1433
1434 class YahooIE(InfoExtractor):
1435 """Information extractor for video.yahoo.com."""
1436
1437 # _VALID_URL matches all Yahoo! Video URLs
1438 # _VPAGE_URL matches only the extractable '/watch/' URLs
1439 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1440 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1441
1442 def __init__(self, downloader=None):
1443 InfoExtractor.__init__(self, downloader)
1444
1445 @staticmethod
1446 def suitable(url):
1447 return (re.match(YahooIE._VALID_URL, url) is not None)
1448
1449 def report_download_webpage(self, video_id):
1450 """Report webpage download."""
1451 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1452
1453 def report_extraction(self, video_id):
1454 """Report information extraction."""
1455 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1456
1457 def _real_initialize(self):
1458 return
1459
1460 def _real_extract(self, url, new_video=True):
1461 # Extract ID from URL
1462 mobj = re.match(self._VALID_URL, url)
1463 if mobj is None:
1464 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1465 return
1466
1467 # At this point we have a new video
1468 self._downloader.increment_downloads()
1469 video_id = mobj.group(2)
1470 video_extension = 'flv'
1471
1472 # Rewrite valid but non-extractable URLs as
1473 # extractable English language /watch/ URLs
1474 if re.match(self._VPAGE_URL, url) is None:
1475 request = urllib2.Request(url)
1476 try:
1477 webpage = urllib2.urlopen(request).read()
1478 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1479 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1480 return
1481
1482 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1483 if mobj is None:
1484 self._downloader.trouble(u'ERROR: Unable to extract id field')
1485 return
1486 yahoo_id = mobj.group(1)
1487
1488 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1489 if mobj is None:
1490 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1491 return
1492 yahoo_vid = mobj.group(1)
1493
1494 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1495 return self._real_extract(url, new_video=False)
1496
1497 # Retrieve video webpage to extract further information
1498 request = urllib2.Request(url)
1499 try:
1500 self.report_download_webpage(video_id)
1501 webpage = urllib2.urlopen(request).read()
1502 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1503 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1504 return
1505
1506 # Extract uploader and title from webpage
1507 self.report_extraction(video_id)
1508 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1509 if mobj is None:
1510 self._downloader.trouble(u'ERROR: unable to extract video title')
1511 return
1512 video_title = mobj.group(1).decode('utf-8')
1513 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1514
1515 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1516 if mobj is None:
1517 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1518 return
1519 video_uploader = mobj.group(1).decode('utf-8')
1520
1521 # Extract video thumbnail
1522 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1523 if mobj is None:
1524 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1525 return
1526 video_thumbnail = mobj.group(1).decode('utf-8')
1527
1528 # Extract video description
1529 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1530 if mobj is None:
1531 self._downloader.trouble(u'ERROR: unable to extract video description')
1532 return
1533 video_description = mobj.group(1).decode('utf-8')
1534 if not video_description: video_description = 'No description available.'
1535
1536 # Extract video height and width
1537 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1538 if mobj is None:
1539 self._downloader.trouble(u'ERROR: unable to extract video height')
1540 return
1541 yv_video_height = mobj.group(1)
1542
1543 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1544 if mobj is None:
1545 self._downloader.trouble(u'ERROR: unable to extract video width')
1546 return
1547 yv_video_width = mobj.group(1)
1548
1549 # Retrieve video playlist to extract media URL
1550 # I'm not completely sure what all these options are, but we
1551 # seem to need most of them, otherwise the server sends a 401.
1552 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1553 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1554 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1555 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1556 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1557 try:
1558 self.report_download_webpage(video_id)
1559 webpage = urllib2.urlopen(request).read()
1560 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1561 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1562 return
1563
1564 # Extract media URL from playlist XML
1565 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1566 if mobj is None:
1567 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1568 return
1569 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1570 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1571
1572 try:
1573 # Process video information
1574 self._downloader.process_info({
1575 'id': video_id.decode('utf-8'),
1576 'url': video_url,
1577 'uploader': video_uploader,
1578 'upload_date': u'NA',
1579 'title': video_title,
1580 'stitle': simple_title,
1581 'ext': video_extension.decode('utf-8'),
1582 'thumbnail': video_thumbnail.decode('utf-8'),
1583 'description': video_description,
1584 'thumbnail': video_thumbnail,
1585 'description': video_description,
1586 'player_url': None,
1587 })
1588 except UnavailableVideoError:
1589 self._downloader.trouble(u'ERROR: unable to download video')
1590
1591
1592 class GenericIE(InfoExtractor):
1593 """Generic last-resort information extractor."""
1594
1595 def __init__(self, downloader=None):
1596 InfoExtractor.__init__(self, downloader)
1597
1598 @staticmethod
1599 def suitable(url):
1600 return True
1601
1602 def report_download_webpage(self, video_id):
1603 """Report webpage download."""
1604 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1605 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1606
1607 def report_extraction(self, video_id):
1608 """Report information extraction."""
1609 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1610
1611 def _real_initialize(self):
1612 return
1613
1614 def _real_extract(self, url):
1615 # At this point we have a new video
1616 self._downloader.increment_downloads()
1617
1618 video_id = url.split('/')[-1]
1619 request = urllib2.Request(url)
1620 try:
1621 self.report_download_webpage(video_id)
1622 webpage = urllib2.urlopen(request).read()
1623 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1624 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1625 return
1626 except ValueError, err:
1627 # since this is the last-resort InfoExtractor, if
1628 # this error is thrown, it'll be thrown here
1629 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1630 return
1631
1632 self.report_extraction(video_id)
1633 # Start with something easy: JW Player in SWFObject
1634 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1635 if mobj is None:
1636 # Broaden the search a little bit
1637 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1638 if mobj is None:
1639 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1640 return
1641
1642 # It's possible that one of the regexes
1643 # matched, but returned an empty group:
1644 if mobj.group(1) is None:
1645 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1646 return
1647
1648 video_url = urllib.unquote(mobj.group(1))
1649 video_id = os.path.basename(video_url)
1650
1651 # here's a fun little line of code for you:
1652 video_extension = os.path.splitext(video_id)[1][1:]
1653 video_id = os.path.splitext(video_id)[0]
1654
1655 # it's tempting to parse this further, but you would
1656 # have to take into account all the variations like
1657 # Video Title - Site Name
1658 # Site Name | Video Title
1659 # Video Title - Tagline | Site Name
1660 # and so on and so forth; it's just not practical
1661 mobj = re.search(r'<title>(.*)</title>', webpage)
1662 if mobj is None:
1663 self._downloader.trouble(u'ERROR: unable to extract title')
1664 return
1665 video_title = mobj.group(1).decode('utf-8')
1666 video_title = sanitize_title(video_title)
1667 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1668
1669 # video uploader is domain name
1670 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1671 if mobj is None:
1672 self._downloader.trouble(u'ERROR: unable to extract title')
1673 return
1674 video_uploader = mobj.group(1).decode('utf-8')
1675
1676 try:
1677 # Process video information
1678 self._downloader.process_info({
1679 'id': video_id.decode('utf-8'),
1680 'url': video_url.decode('utf-8'),
1681 'uploader': video_uploader,
1682 'upload_date': u'NA',
1683 'title': video_title,
1684 'stitle': simple_title,
1685 'ext': video_extension.decode('utf-8'),
1686 'format': u'NA',
1687 'player_url': None,
1688 })
1689 except UnavailableVideoError, err:
1690 self._downloader.trouble(u'ERROR: unable to download video')
1691
1692
1693 class YoutubeSearchIE(InfoExtractor):
1694 """Information Extractor for YouTube search queries."""
1695 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1696 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1697 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1698 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1699 _youtube_ie = None
1700 _max_youtube_results = 1000
1701
1702 def __init__(self, youtube_ie, downloader=None):
1703 InfoExtractor.__init__(self, downloader)
1704 self._youtube_ie = youtube_ie
1705
1706 @staticmethod
1707 def suitable(url):
1708 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1709
1710 def report_download_page(self, query, pagenum):
1711 """Report attempt to download playlist page with given number."""
1712 query = query.decode(preferredencoding())
1713 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1714
1715 def _real_initialize(self):
1716 self._youtube_ie.initialize()
1717
1718 def _real_extract(self, query):
1719 mobj = re.match(self._VALID_QUERY, query)
1720 if mobj is None:
1721 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1722 return
1723
1724 prefix, query = query.split(':')
1725 prefix = prefix[8:]
1726 query = query.encode('utf-8')
1727 if prefix == '':
1728 self._download_n_results(query, 1)
1729 return
1730 elif prefix == 'all':
1731 self._download_n_results(query, self._max_youtube_results)
1732 return
1733 else:
1734 try:
1735 n = long(prefix)
1736 if n <= 0:
1737 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1738 return
1739 elif n > self._max_youtube_results:
1740 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1741 n = self._max_youtube_results
1742 self._download_n_results(query, n)
1743 return
1744 except ValueError: # parsing prefix as integer fails
1745 self._download_n_results(query, 1)
1746 return
1747
1748 def _download_n_results(self, query, n):
1749 """Downloads a specified number of results for a query"""
1750
1751 video_ids = []
1752 already_seen = set()
1753 pagenum = 1
1754
1755 while True:
1756 self.report_download_page(query, pagenum)
1757 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1758 request = urllib2.Request(result_url, None, std_headers)
1759 try:
1760 page = urllib2.urlopen(request).read()
1761 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1762 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1763 return
1764
1765 # Extract video identifiers
1766 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1767 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1768 if video_id not in already_seen:
1769 video_ids.append(video_id)
1770 already_seen.add(video_id)
1771 if len(video_ids) == n:
1772 # Specified n videos reached
1773 for id in video_ids:
1774 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1775 return
1776
1777 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1778 for id in video_ids:
1779 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1780 return
1781
1782 pagenum = pagenum + 1
1783
1784 class GoogleSearchIE(InfoExtractor):
1785 """Information Extractor for Google Video search queries."""
1786 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1787 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1788 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1789 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1790 _google_ie = None
1791 _max_google_results = 1000
1792
1793 def __init__(self, google_ie, downloader=None):
1794 InfoExtractor.__init__(self, downloader)
1795 self._google_ie = google_ie
1796
1797 @staticmethod
1798 def suitable(url):
1799 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1800
1801 def report_download_page(self, query, pagenum):
1802 """Report attempt to download playlist page with given number."""
1803 query = query.decode(preferredencoding())
1804 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1805
1806 def _real_initialize(self):
1807 self._google_ie.initialize()
1808
1809 def _real_extract(self, query):
1810 mobj = re.match(self._VALID_QUERY, query)
1811 if mobj is None:
1812 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1813 return
1814
1815 prefix, query = query.split(':')
1816 prefix = prefix[8:]
1817 query = query.encode('utf-8')
1818 if prefix == '':
1819 self._download_n_results(query, 1)
1820 return
1821 elif prefix == 'all':
1822 self._download_n_results(query, self._max_google_results)
1823 return
1824 else:
1825 try:
1826 n = long(prefix)
1827 if n <= 0:
1828 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1829 return
1830 elif n > self._max_google_results:
1831 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1832 n = self._max_google_results
1833 self._download_n_results(query, n)
1834 return
1835 except ValueError: # parsing prefix as integer fails
1836 self._download_n_results(query, 1)
1837 return
1838
1839 def _download_n_results(self, query, n):
1840 """Downloads a specified number of results for a query"""
1841
1842 video_ids = []
1843 already_seen = set()
1844 pagenum = 1
1845
1846 while True:
1847 self.report_download_page(query, pagenum)
1848 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1849 request = urllib2.Request(result_url, None, std_headers)
1850 try:
1851 page = urllib2.urlopen(request).read()
1852 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1853 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1854 return
1855
1856 # Extract video identifiers
1857 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1858 video_id = mobj.group(1)
1859 if video_id not in already_seen:
1860 video_ids.append(video_id)
1861 already_seen.add(video_id)
1862 if len(video_ids) == n:
1863 # Specified n videos reached
1864 for id in video_ids:
1865 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1866 return
1867
1868 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1869 for id in video_ids:
1870 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1871 return
1872
1873 pagenum = pagenum + 1
1874
1875 class YahooSearchIE(InfoExtractor):
1876 """Information Extractor for Yahoo! Video search queries."""
1877 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1878 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1879 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1880 _MORE_PAGES_INDICATOR = r'\s*Next'
1881 _yahoo_ie = None
1882 _max_yahoo_results = 1000
1883
1884 def __init__(self, yahoo_ie, downloader=None):
1885 InfoExtractor.__init__(self, downloader)
1886 self._yahoo_ie = yahoo_ie
1887
1888 @staticmethod
1889 def suitable(url):
1890 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1891
1892 def report_download_page(self, query, pagenum):
1893 """Report attempt to download playlist page with given number."""
1894 query = query.decode(preferredencoding())
1895 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1896
1897 def _real_initialize(self):
1898 self._yahoo_ie.initialize()
1899
1900 def _real_extract(self, query):
1901 mobj = re.match(self._VALID_QUERY, query)
1902 if mobj is None:
1903 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1904 return
1905
1906 prefix, query = query.split(':')
1907 prefix = prefix[8:]
1908 query = query.encode('utf-8')
1909 if prefix == '':
1910 self._download_n_results(query, 1)
1911 return
1912 elif prefix == 'all':
1913 self._download_n_results(query, self._max_yahoo_results)
1914 return
1915 else:
1916 try:
1917 n = long(prefix)
1918 if n <= 0:
1919 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1920 return
1921 elif n > self._max_yahoo_results:
1922 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1923 n = self._max_yahoo_results
1924 self._download_n_results(query, n)
1925 return
1926 except ValueError: # parsing prefix as integer fails
1927 self._download_n_results(query, 1)
1928 return
1929
1930 def _download_n_results(self, query, n):
1931 """Downloads a specified number of results for a query"""
1932
1933 video_ids = []
1934 already_seen = set()
1935 pagenum = 1
1936
1937 while True:
1938 self.report_download_page(query, pagenum)
1939 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1940 request = urllib2.Request(result_url, None, std_headers)
1941 try:
1942 page = urllib2.urlopen(request).read()
1943 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1944 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1945 return
1946
1947 # Extract video identifiers
1948 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1949 video_id = mobj.group(1)
1950 if video_id not in already_seen:
1951 video_ids.append(video_id)
1952 already_seen.add(video_id)
1953 if len(video_ids) == n:
1954 # Specified n videos reached
1955 for id in video_ids:
1956 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1957 return
1958
1959 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1960 for id in video_ids:
1961 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1962 return
1963
1964 pagenum = pagenum + 1
1965
1966 class YoutubePlaylistIE(InfoExtractor):
1967 """Information Extractor for YouTube playlists."""
1968
1969 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1970 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1971 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1972 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1973 _youtube_ie = None
1974
1975 def __init__(self, youtube_ie, downloader=None):
1976 InfoExtractor.__init__(self, downloader)
1977 self._youtube_ie = youtube_ie
1978
1979 @staticmethod
1980 def suitable(url):
1981 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1982
1983 def report_download_page(self, playlist_id, pagenum):
1984 """Report attempt to download playlist page with given number."""
1985 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1986
1987 def _real_initialize(self):
1988 self._youtube_ie.initialize()
1989
1990 def _real_extract(self, url):
1991 # Extract playlist id
1992 mobj = re.match(self._VALID_URL, url)
1993 if mobj is None:
1994 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1995 return
1996
1997 # Download playlist pages
1998 playlist_id = mobj.group(1)
1999 video_ids = []
2000 pagenum = 1
2001
2002 while True:
2003 self.report_download_page(playlist_id, pagenum)
2004 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
2005 try:
2006 page = urllib2.urlopen(request).read()
2007 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2008 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2009 return
2010
2011 # Extract video identifiers
2012 ids_in_page = []
2013 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2014 if mobj.group(1) not in ids_in_page:
2015 ids_in_page.append(mobj.group(1))
2016 video_ids.extend(ids_in_page)
2017
2018 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2019 break
2020 pagenum = pagenum + 1
2021
2022 playliststart = self._downloader.params.get('playliststart', 1) - 1
2023 playlistend = self._downloader.params.get('playlistend', -1)
2024 video_ids = video_ids[playliststart:playlistend]
2025
2026 for id in video_ids:
2027 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2028 return
2029
2030 class YoutubeUserIE(InfoExtractor):
2031 """Information Extractor for YouTube users."""
2032
2033 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2034 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2035 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2036 _youtube_ie = None
2037
2038 def __init__(self, youtube_ie, downloader=None):
2039 InfoExtractor.__init__(self, downloader)
2040 self._youtube_ie = youtube_ie
2041
2042 @staticmethod
2043 def suitable(url):
2044 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2045
2046 def report_download_page(self, username):
2047 """Report attempt to download user page."""
2048 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2049
2050 def _real_initialize(self):
2051 self._youtube_ie.initialize()
2052
2053 def _real_extract(self, url):
2054 # Extract username
2055 mobj = re.match(self._VALID_URL, url)
2056 if mobj is None:
2057 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2058 return
2059
2060 # Download user page
2061 username = mobj.group(1)
2062 video_ids = []
2063 pagenum = 1
2064
2065 self.report_download_page(username)
2066 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2067 try:
2068 page = urllib2.urlopen(request).read()
2069 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2070 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2071 return
2072
2073 # Extract video identifiers
2074 ids_in_page = []
2075
2076 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2077 if mobj.group(1) not in ids_in_page:
2078 ids_in_page.append(mobj.group(1))
2079 video_ids.extend(ids_in_page)
2080
2081 playliststart = self._downloader.params.get('playliststart', 1) - 1
2082 playlistend = self._downloader.params.get('playlistend', -1)
2083 video_ids = video_ids[playliststart:playlistend]
2084
2085 for id in video_ids:
2086 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2087 return
2088
2089 class DepositFilesIE(InfoExtractor):
2090 """Information extractor for depositfiles.com"""
2091
2092 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2093
2094 def __init__(self, downloader=None):
2095 InfoExtractor.__init__(self, downloader)
2096
2097 @staticmethod
2098 def suitable(url):
2099 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2100
2101 def report_download_webpage(self, file_id):
2102 """Report webpage download."""
2103 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2104
2105 def report_extraction(self, file_id):
2106 """Report information extraction."""
2107 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2108
2109 def _real_initialize(self):
2110 return
2111
2112 def _real_extract(self, url):
2113 # At this point we have a new file
2114 self._downloader.increment_downloads()
2115
2116 file_id = url.split('/')[-1]
2117 # Rebuild url in english locale
2118 url = 'http://depositfiles.com/en/files/' + file_id
2119
2120 # Retrieve file webpage with 'Free download' button pressed
2121 free_download_indication = { 'gateway_result' : '1' }
2122 request = urllib2.Request(url, urllib.urlencode(free_download_indication), std_headers)
2123 try:
2124 self.report_download_webpage(file_id)
2125 webpage = urllib2.urlopen(request).read()
2126 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2127 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2128 return
2129
2130 # Search for the real file URL
2131 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2132 if (mobj is None) or (mobj.group(1) is None):
2133 # Try to figure out reason of the error.
2134 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2135 if (mobj is not None) and (mobj.group(1) is not None):
2136 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2137 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2138 else:
2139 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2140 return
2141
2142 file_url = mobj.group(1)
2143 file_extension = os.path.splitext(file_url)[1][1:]
2144
2145 # Search for file title
2146 mobj = re.search(r'<b title="(.*?)">', webpage)
2147 if mobj is None:
2148 self._downloader.trouble(u'ERROR: unable to extract title')
2149 return
2150 file_title = mobj.group(1).decode('utf-8')
2151
2152 try:
2153 # Process file information
2154 self._downloader.process_info({
2155 'id': file_id.decode('utf-8'),
2156 'url': file_url.decode('utf-8'),
2157 'uploader': u'NA',
2158 'upload_date': u'NA',
2159 'title': file_title,
2160 'stitle': file_title,
2161 'ext': file_extension.decode('utf-8'),
2162 'format': u'NA',
2163 'player_url': None,
2164 })
2165 except UnavailableVideoError, err:
2166 self._downloader.trouble(u'ERROR: unable to download file')
2167
class PostProcessor(object):
	"""Base class for post processors.

	Instances are attached to a downloader through its
	add_post_processor() method.  After a successful download the
	downloader calls run() on each registered PostProcessor in turn,
	feeding the value returned by one to the next.  A None return value
	stops the chain; reaching the last processor also ends it.

	Like InfoExtractor objects, PostProcessors register mutually with
	their downloader.
	"""

	# Downloader this post processor is attached to (set via the
	# constructor or set_downloader()).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		"information" is an InfoExtractor-style dictionary augmented
		with a "filepath" key that points at the downloaded file.

		Returning None halts the postprocessing chain; returning a
		dictionary (possibly the received one with some fields changed)
		passes it along to the next PostProcessor in the chain.

		Implementations may also raise PostProcessingError, which the
		calling downloader takes into account.
		"""
		# Default behaviour: pass the information through untouched.
		return information
### MAIN PROGRAM ###
# Command-line entry point: parse options, wire up the cookie jar and
# urllib2 openers, instantiate every InfoExtractor, build the
# FileDownloader, and run it over all requested URLs.
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from bitbucket.org
		def update_self(downloader, filename):
			# Overwrites this very script in place with the latest
			# released version fetched over the network.
			# Note: downloader only used for options
			if not os.access (filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			downloader.to_screen('Updating to latest stable version...')
			latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
			latest_version = urllib.urlopen(latest_url).read().strip()
			prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
			newcontent = urllib.urlopen(prog_url).read()
			# NOTE(review): the rewrite is not atomic — a failure between
			# open() and close() leaves a truncated script behind.
			stream = open(filename, 'w')
			stream.write(newcontent)
			stream.close()
			downloader.to_screen('Updated to version %s' % latest_version)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.12.09',
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-R', '--retries',
				dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
		parser.add_option('--playlist-start',
				dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
		parser.add_option('--playlist-end',
				dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='USERNAME', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PASSWORD', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FORMAT', help='video format code')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		video_format.add_option('--max-quality',
				action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-A', '--auto-number',
				action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TEMPLATE', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		filesystem.add_option('--cookies',
				dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Open appropriate CookieJar
		if opts.cookiefile is None:
			jar = cookielib.CookieJar()
		else:
			try:
				jar = cookielib.MozillaCookieJar(opts.cookiefile)
				# Only load when the file exists and is readable; a
				# brand-new cookie file simply starts empty.
				if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
					jar.load()
			except (IOError, OSError), err:
				sys.exit(u'ERROR: unable to open cookie file')

		# General configuration
		cookie_processor = urllib2.HTTPCookieProcessor(jar)
		# NOTE(review): the second install_opener() call replaces the
		# first, so the ProxyHandler-based opener looks discarded —
		# confirm whether both handlers should go into one build_opener().
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(cookie_processor))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Batch file verification
		batchurls = []
		if opts.batchfile is not None:
			try:
				if opts.batchfile == '-':
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				# Skip blank lines and lines starting with '#', '/' or ';'
				# (treated as comments in the batch file).
				batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
			parser.error(u'using output template conflicts with using title, literal title or auto number')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')
		try:
			opts.playliststart = long(opts.playliststart)
			if opts.playliststart <= 0:
				raise ValueError
		except (TypeError, ValueError), err:
			parser.error(u'invalid playlist start number specified')
		try:
			opts.playlistend = long(opts.playlistend)
			# -1 is the sentinel for "up to the last video".
			if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
				raise ValueError
		except (TypeError, ValueError), err:
			parser.error(u'invalid playlist end number specified')

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		deposit_files_ie = DepositFilesIE()
		generic_ie = GenericIE()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'format': opts.format,
			'format_limit': opts.format_limit,
			# First matching template wins: explicit -o, then the
			# format/title/autonumber combinations, then plain id.
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			'playliststart': opts.playliststart,
			'playlistend': opts.playlistend,
			# presumably '-o -' means "write video to stdout", pushing
			# status messages to stderr — TODO confirm in FileDownloader
			'logtostderr': opts.outtmpl == '-',
			})
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(dailymotion_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)
		fd.add_info_extractor(deposit_files_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		retcode = fd.download(all_urls)

		# Dump cookie jar if requested
		if opts.cookiefile is not None:
			try:
				jar.save()
			except (IOError, OSError), err:
				sys.exit(u'ERROR: unable to save cookie jar')

		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')