1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3# Author: Ricardo Garcia Gonzalez
64a6f26c 4# Author: Danny Colligan
49c0028a 5# Author: Benjamin Johnson
4fa74b52 6# License: Public domain code
80066952 7import cookielib
8import htmlentitydefs
9import httplib
2546e767 10import locale
11import math
12import netrc
13import os
14import os.path
15import re
16import socket
17import string
0487b407 18import subprocess
19import sys
20import time
21import urllib
22import urllib2
23
24# parse_qs was moved from the cgi module to the urlparse module recently.
25try:
26 from urlparse import parse_qs
27except ImportError:
28 from cgi import parse_qs
4fa74b52 29
f995f712 30std_headers = {
a6a61601 31 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.11) Gecko/20101019 Firefox/3.6.11',
4fa74b52 32 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
96942e62 33 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
34 'Accept-Language': 'en-us,en;q=0.5',
35}
36
37simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
38
39def preferredencoding():
40 """Get preferred encoding.
41
42 Returns the best encoding scheme for the system, based on
43 locale.getpreferredencoding() and some further tweaks.
44 """
45 def yield_preferredencoding():
46 try:
47 pref = locale.getpreferredencoding()
48 u'TEST'.encode(pref)
49 except:
50 pref = 'UTF-8'
51 while True:
52 yield pref
53 return yield_preferredencoding().next()
eae2666c 54
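# Illustrative note (not executed): preferredencoding() returns whatever
# locale.getpreferredencoding() reports, falling back to 'UTF-8' when the
# reported codec is unusable. On a typical en_US.UTF-8 system, for example:
#   >>> preferredencoding()
#   'UTF-8'
# The exact value depends entirely on the caller's locale settings.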
55def htmlentity_transform(matchobj):
56 """Transforms an HTML entity to a Unicode character.
57
58 This function receives a match object and is intended to be used with
59 the re.sub() function.
60 """
61 entity = matchobj.group(1)
62
63 # Known non-numeric HTML entity
64 if entity in htmlentitydefs.name2codepoint:
65 return unichr(htmlentitydefs.name2codepoint[entity])
66
67 # Unicode character
68 mobj = re.match(ur'(?u)#(x?\d+)', entity)
69 if mobj is not None:
70 numstr = mobj.group(1)
71 if numstr.startswith(u'x'):
72 base = 16
73 numstr = u'0%s' % numstr
74 else:
75 base = 10
76 return unichr(long(numstr, base))
77
78 # Unknown entity in name, return its literal representation
79 return (u'&%s;' % entity)
80
81def sanitize_title(utitle):
31bcb480 82 """Sanitizes a video title so it could be used as part of a filename."""
490fd7ae 83 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
84 return utitle.replace(unicode(os.sep), u'%')
85
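# Illustrative sketch (not executed), assuming a POSIX os.sep of '/':
#   >>> sanitize_title(u'Foo &amp; Bar/Baz')
#   u'Foo & Bar%Baz'
# HTML entities are decoded first, then path separators become '%'.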
86def sanitize_open(filename, open_mode):
87 """Try to open the given filename, and slightly tweak it if this fails.
88
89 Attempts to open the given filename. If this fails, it tries to change
90 the filename slightly, step by step, until it's either able to open it
91 or it fails and raises a final exception, like the standard open()
92 function.
93
94 It returns the tuple (stream, definitive_file_name).
95 """
96 try:
131bc765 97 if filename == u'-':
98 if sys.platform == 'win32':
99 import msvcrt
100 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
131bc765 101 return (sys.stdout, filename)
102 stream = open(filename, open_mode)
103 return (stream, filename)
104 except (IOError, OSError), err:
105 # In case of error, try to remove win32 forbidden chars
ca6a11fa 106 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
107
108 # An exception here should be caught in the caller
109 stream = open(filename, open_mode)
110 return (stream, filename)
111
112
113class DownloadError(Exception):
114 """Download Error exception.
115
116 This exception may be thrown by FileDownloader objects if they are not
117 configured to continue on errors. They will contain the appropriate
118 error message.
119 """
120 pass
121
122class SameFileError(Exception):
123 """Same File exception.
124
125 This exception will be thrown by FileDownloader objects if they detect
126 multiple files would have to be downloaded to the same file on disk.
127 """
128 pass
129
130class PostProcessingError(Exception):
131 """Post Processing exception.
132
133 This exception may be raised by PostProcessor's .run() method to
134 indicate an error in the postprocessing task.
135 """
136 pass
137
73f4e7af 138class UnavailableVideoError(Exception):
7b7759f5 139 """Unavailable Format exception.
140
141 This exception will be thrown when a video is requested
142 in a format that is not available for that video.
143 """
144 pass
145
146class ContentTooShortError(Exception):
147 """Content Too Short exception.
148
149 This exception may be raised by FileDownloader objects when a file they
150 download is too small for what the server announced first, indicating
151 the connection was probably interrupted.
152 """
153 # Both in bytes
154 downloaded = None
155 expected = None
156
157 def __init__(self, downloaded, expected):
158 self.downloaded = downloaded
159 self.expected = expected
7b7759f5 160
161class FileDownloader(object):
162 """File Downloader class.
163
164 File downloader objects are the ones responsible for downloading the
165 actual video file and writing it to disk if the user has requested
166 it, among some other tasks. In most cases there should be one per
167 program. Since, given a video URL, the downloader doesn't know how to
168 extract all the needed information (that is the InfoExtractors' task), it
169 has to pass the URL to one of them.
170
171 For this, file downloader objects have a method that allows
172 InfoExtractors to be registered in a given order. When it is passed
173 a URL, the file downloader hands it to the first InfoExtractor it
174 finds that reports being able to handle it. The InfoExtractor extracts
175 all the information about the video or videos the URL refers to, and
176 asks the FileDownloader to process the video information, possibly
177 downloading the video.
178
179 File downloaders accept a lot of parameters. In order not to saturate
180 the object constructor with arguments, it receives a dictionary of
181 options instead. These options are available through the params
182 attribute for the InfoExtractors to use. The FileDownloader also
183 registers itself as the downloader in charge for the InfoExtractors
184 that are added to it, so this is a "mutual registration".
185
186 Available options:
187
188 username: Username for authentication purposes.
189 password: Password for authentication purposes.
190 usenetrc: Use netrc for authentication instead.
191 quiet: Do not print messages to stdout.
192 forceurl: Force printing final URL.
193 forcetitle: Force printing title.
194 forcethumbnail: Force printing thumbnail URL.
195 forcedescription: Force printing description.
196 simulate: Do not download the video files.
197 format: Video format code.
198 format_limit: Highest quality format to try.
199 outtmpl: Template for output names.
200 ignoreerrors: Do not stop on download errors.
201 ratelimit: Download speed limit, in bytes/sec.
202 nooverwrites: Prevent overwriting files.
203 retries: Number of times to retry for HTTP error 5xx.
204 continuedl: Try to continue downloads if possible.
205 noprogress: Do not print the progress bar.
206 playliststart: Playlist item to start at.
8cc44341 207 playlistend: Playlist item to end at.
331ce0a0 208 logtostderr: Log messages to stderr instead of stdout.
209 """
210
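# Minimal usage sketch (illustrative only; the URL and template below are
# made-up examples, not values taken from this script):
#   fd = FileDownloader({'outtmpl': u'%(stitle)s-%(id)s.%(ext)s'})
#   fd.add_info_extractor(YoutubeIE())
#   retcode = fd.download(['http://www.youtube.com/watch?v=abcdefghijk'])
# Each registered InfoExtractor is tried in order until one reports the URL
# as suitable; the extractor then calls back into process_info() on this
# FileDownloader.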
d0a9affb 211 params = None
4fa74b52 212 _ies = []
65cd34c5 213 _pps = []
9bf386d7 214 _download_retcode = None
7d8d0612 215 _num_downloads = None
331ce0a0 216 _screen_file = None
217
218 def __init__(self, params):
1c5e2302 219 """Create a FileDownloader object with the given options."""
4fa74b52 220 self._ies = []
65cd34c5 221 self._pps = []
9bf386d7 222 self._download_retcode = 0
7d8d0612 223 self._num_downloads = 0
331ce0a0 224 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
d0a9affb 225 self.params = params
226
227 @staticmethod
228 def pmkdir(filename):
229 """Create directory components in filename. Similar to Unix "mkdir -p"."""
230 components = filename.split(os.sep)
231 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
3af1e172 232 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
233 for dir in aggregate:
234 if not os.path.exists(dir):
235 os.mkdir(dir)
236
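# Illustrative note (assuming a POSIX os.sep of '/'): pmkdir('a/b/c.flv')
# creates 'a/' and 'a/b/' if they are missing, but never the final path
# component (the file itself).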
237 @staticmethod
238 def format_bytes(bytes):
239 if bytes is None:
240 return 'N/A'
241 if type(bytes) is str:
242 bytes = float(bytes)
243 if bytes == 0.0:
244 exponent = 0
245 else:
8497c36d 246 exponent = long(math.log(bytes, 1024.0))
4fa74b52 247 suffix = 'bkMGTPEZY'[exponent]
248 converted = float(bytes) / float(1024**exponent)
249 return '%.2f%s' % (converted, suffix)
250
251 @staticmethod
252 def calc_percent(byte_counter, data_len):
253 if data_len is None:
254 return '---.-%'
255 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
256
257 @staticmethod
258 def calc_eta(start, now, total, current):
259 if total is None:
260 return '--:--'
261 dif = now - start
262 if current == 0 or dif < 0.001: # One millisecond
263 return '--:--'
264 rate = float(current) / dif
265 eta = long((float(total) - float(current)) / rate)
266 (eta_mins, eta_secs) = divmod(eta, 60)
267 if eta_mins > 99:
268 return '--:--'
269 return '%02d:%02d' % (eta_mins, eta_secs)
270
5121ef20 271 @staticmethod
272 def calc_speed(start, now, bytes):
273 dif = now - start
274 if bytes == 0 or dif < 0.001: # One millisecond
9fcd8355 275 return '%10s' % '---b/s'
276 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
277
278 @staticmethod
279 def best_block_size(elapsed_time, bytes):
280 new_min = max(bytes / 2.0, 1.0)
281 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
282 if elapsed_time < 0.001:
e1f18b8a 283 return long(new_max)
284 rate = bytes / elapsed_time
285 if rate > new_max:
e1f18b8a 286 return long(new_max)
4fa74b52 287 if rate < new_min:
288 return long(new_min)
289 return long(rate)
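# Illustrative note: the next block size is the measured rate
# bytes / elapsed_time, clamped to the range [bytes / 2, min(bytes * 2, 4 MB)].
# E.g. 1024 bytes read in 0.1 s gives a rate of 10240 B/s, which is clamped
# down to 2048.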
4fa74b52 290
291 @staticmethod
292 def parse_bytes(bytestr):
293 """Parse a string indicating a byte quantity into a long integer."""
294 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
295 if matchobj is None:
296 return None
297 number = float(matchobj.group(1))
298 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
299 return long(round(number * multiplier))
300
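# Illustrative sketch (not executed):
#   >>> FileDownloader.parse_bytes('10.5m')
#   11010048L
#   >>> FileDownloader.format_bytes(11010048L)
#   '10.50M'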
301 def add_info_extractor(self, ie):
302 """Add an InfoExtractor object to the end of the list."""
303 self._ies.append(ie)
304 ie.set_downloader(self)
305
306 def add_post_processor(self, pp):
307 """Add a PostProcessor object to the end of the chain."""
308 self._pps.append(pp)
309 pp.set_downloader(self)
310
331ce0a0 311 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
9fcd8355 312 """Print message to stdout if not in quiet mode."""
313 try:
314 if not self.params.get('quiet', False):
315 terminator = [u'\n', u''][skip_eol]
316 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
317 self._screen_file.flush()
318 except (UnicodeEncodeError), err:
319 if not ignore_encoding_errors:
320 raise
321
322 def to_stderr(self, message):
323 """Print message to stderr."""
eae2666c 324 print >>sys.stderr, message.encode(preferredencoding())
325
326 def fixed_template(self):
327 """Checks if the output template is fixed."""
d0a9affb 328 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
9fcd8355 329
330 def trouble(self, message=None):
331 """Determine action to take when a download problem appears.
332
333 Depending on whether the downloader has been configured to ignore
e5bf0f55 334 download errors or not, this method may raise an exception
9bf386d7 335 (after printing the message) when errors are found.
336 """
337 if message is not None:
338 self.to_stderr(message)
d0a9affb 339 if not self.params.get('ignoreerrors', False):
e5bf0f55 340 raise DownloadError(message)
9bf386d7 341 self._download_retcode = 1
0086d1ec 342
343 def slow_down(self, start_time, byte_counter):
344 """Sleep if the download speed is over the rate limit."""
d0a9affb 345 rate_limit = self.params.get('ratelimit', None)
346 if rate_limit is None or byte_counter == 0:
347 return
348 now = time.time()
349 elapsed = now - start_time
350 if elapsed <= 0.0:
351 return
352 speed = float(byte_counter) / elapsed
353 if speed > rate_limit:
354 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
355
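# Illustrative note: the sleep above lines elapsed time up with how long the
# downloaded bytes *should* have taken at the limit. E.g. with a ratelimit of
# 50 kB/s, 500 kB downloaded in 8 s sleeps for roughly 2 s, so that
# 500 kB / 10 s == 50 kB/s.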
356 def report_destination(self, filename):
357 """Report destination filename."""
331ce0a0 358 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
359
360 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
361 """Report download progress."""
362 if self.params.get('noprogress', False):
363 return
331ce0a0 364 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
bafa5cd9 365 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
366
367 def report_resuming_byte(self, resume_len):
8a9f53be 368 """Report attempt to resume at given byte."""
331ce0a0 369 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
7db85b2c 370
7031008c 371 def report_retry(self, count, retries):
e86e9474 372 """Report retry in case of HTTP error 5xx"""
331ce0a0 373 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
7031008c 374
375 def report_file_already_downloaded(self, file_name):
376 """Report file has already been fully downloaded."""
43ab0ca4 377 try:
331ce0a0 378 self.to_screen(u'[download] %s has already been downloaded' % file_name)
43ab0ca4 379 except (UnicodeEncodeError), err:
331ce0a0 380 self.to_screen(u'[download] The file has already been downloaded')
381
382 def report_unable_to_resume(self):
383 """Report it was impossible to resume download."""
331ce0a0 384 self.to_screen(u'[download] Unable to resume')
385
386 def report_finish(self):
387 """Report download finished."""
d9835247 388 if self.params.get('noprogress', False):
331ce0a0 389 self.to_screen(u'[download] Download completed')
d9835247 390 else:
331ce0a0 391 self.to_screen(u'')
392
393 def increment_downloads(self):
394 """Increment the ordinal that assigns a number to each file."""
395 self._num_downloads += 1
bafa5cd9 396
397 def process_info(self, info_dict):
398 """Process a single dictionary returned by an InfoExtractor."""
399 # Do nothing else if in simulate mode
400 if self.params.get('simulate', False):
401 # Forced printings
402 if self.params.get('forcetitle', False):
490fd7ae 403 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
cbfff4db 404 if self.params.get('forceurl', False):
490fd7ae 405 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
406 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
407 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
408 if self.params.get('forcedescription', False) and 'description' in info_dict:
409 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
cbfff4db 410
9bf386d7 411 return
cbfff4db 412
c8619e01 413 try:
414 template_dict = dict(info_dict)
415 template_dict['epoch'] = unicode(long(time.time()))
1e47d226 416 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
ad274509 417 filename = self.params['outtmpl'] % template_dict
c8619e01 418 except (ValueError, KeyError), err:
419 self.trouble(u'ERROR: invalid system charset or erroneous output template')
420 return
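# Illustrative note: the output template above is plain Python string
# interpolation. E.g. an outtmpl of u'%(stitle)s-%(id)s.%(ext)s' could
# produce u'Some_video_title-abcdefghijk.mp4' (the title and id here are
# invented, for illustration only).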
850ab765 421 if self.params.get('nooverwrites', False) and os.path.exists(filename):
5c44af18 422 self.to_stderr(u'WARNING: file exists and will be skipped')
9bf386d7 423 return
7b7759f5 424
425 try:
426 self.pmkdir(filename)
427 except (OSError, IOError), err:
db7e31b8 428 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
9bf386d7 429 return
7b7759f5 430
c8619e01 431 try:
e616ec0c 432 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
c8619e01 433 except (OSError, IOError), err:
73f4e7af 434 raise UnavailableVideoError
c8619e01 435 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
db7e31b8 436 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
9bf386d7 437 return
d69a1c91 438 except (ContentTooShortError, ), err:
db7e31b8 439 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
d69a1c91 440 return
7b7759f5 441
442 if success:
443 try:
444 self.post_process(filename, info_dict)
445 except (PostProcessingError), err:
db7e31b8 446 self.trouble(u'ERROR: postprocessing: %s' % str(err))
55e7c75e 447 return
c8619e01 448
449 def download(self, url_list):
450 """Download a given list of URLs."""
22899cea 451 if len(url_list) > 1 and self.fixed_template():
d0a9affb 452 raise SameFileError(self.params['outtmpl'])
22899cea 453
454 for url in url_list:
455 suitable_found = False
456 for ie in self._ies:
c8619e01 457 # Go to next InfoExtractor if not suitable
458 if not ie.suitable(url):
459 continue
c8619e01 460
461 # Suitable InfoExtractor found
462 suitable_found = True
c8619e01 463
464 # Extract information from URL and process it
465 ie.extract(url)
65cd34c5 466
c8619e01 467 # Suitable InfoExtractor had been found; go to next URL
4fa74b52 468 break
c8619e01 469
4fa74b52 470 if not suitable_found:
db7e31b8 471 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
bb681b88 472
9bf386d7 473 return self._download_retcode
474
475 def post_process(self, filename, ie_info):
476 """Run the postprocessing chain on the given file."""
477 info = dict(ie_info)
478 info['filepath'] = filename
479 for pp in self._pps:
480 info = pp.run(info)
481 if info is None:
482 break
4fa74b52 483
e616ec0c 484 def _download_with_rtmpdump(self, filename, url, player_url):
485 self.report_destination(filename)
486
487 # Check for rtmpdump first
488 try:
489 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
490 except (OSError, IOError):
491 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
492 return False
493
494 # Download using rtmpdump. rtmpdump returns exit code 2 when
495 # the connection was interrupted and resuming appears to be
496 # possible. This is part of rtmpdump's normal usage, AFAIK.
e616ec0c 497 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
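# Note: the [[], [...]][flag] construction above is an idiom used throughout
# this script for a conditional argument list; indexing a two-element list
# with a boolean selects the extra arguments when the flag is true and an
# empty list otherwise.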
498 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
499 while retval == 2 or retval == 1:
e616ec0c 500 prevsize = os.path.getsize(filename)
331ce0a0 501 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
e616ec0c 502 time.sleep(5.0) # This seems to be needed
1c1821f8 503 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
504 cursize = os.path.getsize(filename)
505 if prevsize == cursize and retval == 1:
506 break
0487b407 507 if retval == 0:
331ce0a0 508 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
509 return True
510 else:
db7e31b8 511 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
512 return False
513
e616ec0c 514 def _do_download(self, filename, url, player_url):
515 # Attempt to download using rtmpdump
516 if url.startswith('rtmp'):
e616ec0c 517 return self._download_with_rtmpdump(filename, url, player_url)
0487b407 518
55e7c75e 519 stream = None
9c457d2a 520 open_mode = 'wb'
7db85b2c 521 basic_request = urllib2.Request(url, None, std_headers)
4fa74b52 522 request = urllib2.Request(url, None, std_headers)
7db85b2c 523
9c457d2a 524 # Establish possible resume length
525 if os.path.isfile(filename):
526 resume_len = os.path.getsize(filename)
527 else:
528 resume_len = 0
529
530 # Request parameters in case of being able to resume
850ab765 531 if self.params.get('continuedl', False) and resume_len != 0:
532 self.report_resuming_byte(resume_len)
533 request.add_header('Range','bytes=%d-' % resume_len)
9c457d2a 534 open_mode = 'ab'
55e7c75e 535
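# Illustrative note: the header added above is a standard HTTP range request,
# e.g. 'Range: bytes=123456-' asks the server to resume at byte offset 123456.
# Servers that cannot satisfy the range answer 416, which is handled below.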
536 count = 0
537 retries = self.params.get('retries', 0)
101e0d1e 538 while count <= retries:
539 # Establish connection
540 try:
541 data = urllib2.urlopen(request)
542 break
543 except (urllib2.HTTPError, ), err:
ac249f42 544 if (err.code < 500 or err.code >= 600) and err.code != 416:
101e0d1e 545 # Unexpected HTTP error
7031008c 546 raise
547 elif err.code == 416:
548 # Unable to resume (requested range not satisfiable)
549 try:
550 # Open the connection again without the range header
551 data = urllib2.urlopen(basic_request)
552 content_length = data.info()['Content-Length']
553 except (urllib2.HTTPError, ), err:
ac249f42 554 if err.code < 500 or err.code >= 600:
555 raise
556 else:
557 # Examine the reported length
268fb2bd 558 if (content_length is not None and
204c9398 559 (resume_len - 100 < long(content_length) < resume_len + 100)):
560 # The file had already been fully downloaded.
561 # Explanation to the above condition: in issue #175 it was revealed that
562 # YouTube sometimes adds or removes a few bytes from the end of the file,
563 # changing the file size slightly and causing problems for some users. So
564 # I decided to implement a suggested change and consider the file
565 # completely downloaded if the file size differs less than 100 bytes from
566 # the one in the hard drive.
567 self.report_file_already_downloaded(filename)
568 return True
569 else:
570 # The length does not match, we start the download over
571 self.report_unable_to_resume()
572 open_mode = 'wb'
573 break
574 # Retry
575 count += 1
576 if count <= retries:
577 self.report_retry(count, retries)
578
579 if count > retries:
580 self.trouble(u'ERROR: giving up after %s retries' % retries)
581 return False
7db85b2c 582
583 data_len = data.info().get('Content-length', None)
584 data_len_str = self.format_bytes(data_len)
585 byte_counter = 0
586 block_size = 1024
587 start = time.time()
588 while True:
bafa5cd9 589 # Download and write
590 before = time.time()
591 data_block = data.read(block_size)
592 after = time.time()
593 data_block_len = len(data_block)
594 if data_block_len == 0:
595 break
596 byte_counter += data_block_len
597
598 # Open file just in time
599 if stream is None:
600 try:
31bcb480 601 (stream, filename) = sanitize_open(filename, open_mode)
602 self.report_destination(filename)
603 except (OSError, IOError), err:
db7e31b8 604 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
55e7c75e 605 return False
606 try:
607 stream.write(data_block)
608 except (IOError, OSError), err:
609 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
610 return False
611 block_size = self.best_block_size(after - before, data_block_len)
612
613 # Progress message
614 percent_str = self.calc_percent(byte_counter, data_len)
615 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
616 speed_str = self.calc_speed(start, time.time(), byte_counter)
617 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
618
619 # Apply rate limit
620 self.slow_down(start, byte_counter)
621
bafa5cd9 622 self.report_finish()
4fa74b52 623 if data_len is not None and str(byte_counter) != data_len:
d69a1c91 624 raise ContentTooShortError(byte_counter, long(data_len))
55e7c75e 625 return True
626
627class InfoExtractor(object):
628 """Information Extractor class.
629
630 Information extractors are the classes that, given a URL, extract
631 information from the video (or videos) the URL refers to. This
632 information includes the real video URL, the video title and simplified
633 title, author and others. The information is stored in a dictionary
634 which is then passed to the FileDownloader. The FileDownloader
635 processes this information possibly downloading the video to the file
636 system, among other possible outcomes. The dictionaries must include
637 the following fields:
638
639 id: Video identifier.
640 url: Final video URL.
641 uploader: Nickname of the video uploader.
642 title: Literal title.
643 stitle: Simplified title.
644 ext: Video filename extension.
6ba562b0 645 format: Video format.
e616ec0c 646 player_url: SWF Player URL (may be None).
4fa74b52 647
648 The following fields are optional. Their primary purpose is to allow
649 youtube-dl to serve as the backend for a video search function, such
650 as the one in youtube2mp3. They are only used when their respective
651 forced printing functions are called:
652
653 thumbnail: Full URL to a video thumbnail image.
654 description: One-line video description.
655
656 Subclasses of this one should re-define the _real_initialize() and
657 _real_extract() methods, as well as the suitable() static method.
658 Probably, they should also be instantiated and added to the main
659 downloader.
660 """
661
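# Illustrative sketch (not executed) of the dictionary a subclass passes to
# FileDownloader.process_info(); all field values below are invented and
# every value except player_url is a unicode string:
#   {'id': u'abc123', 'url': u'http://example.com/video.flv',
#    'uploader': u'someone', 'title': u'A title', 'stitle': u'A_title',
#    'ext': u'flv', 'format': u'NA', 'player_url': None}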
662 _ready = False
663 _downloader = None
664
665 def __init__(self, downloader=None):
666 """Constructor. Receives an optional downloader."""
667 self._ready = False
668 self.set_downloader(downloader)
669
670 @staticmethod
671 def suitable(url):
672 """Receives a URL and returns True if suitable for this IE."""
020f7150 673 return False
674
675 def initialize(self):
1c5e2302 676 """Initializes an instance (authentication, etc)."""
677 if not self._ready:
678 self._real_initialize()
679 self._ready = True
680
681 def extract(self, url):
682 """Extracts URL information and returns it in list of dicts."""
683 self.initialize()
684 return self._real_extract(url)
685
686 def set_downloader(self, downloader):
687 """Sets the downloader for this IE."""
688 self._downloader = downloader
689
690 def _real_initialize(self):
691 """Real initialization process. Redefine in subclasses."""
692 pass
693
694 def _real_extract(self, url):
695 """Real extraction process. Redefine in subclasses."""
696 pass
697
698class YoutubeIE(InfoExtractor):
699 """Information extractor for youtube.com."""
700
a949a3ae 701 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
9715661c 702 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
7df4635f 703 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
72ac78b8 704 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
4fa74b52 705 _NETRC_MACHINE = 'youtube'
706 # Listed in order of quality
707 _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
7b7759f5 708 _video_extensions = {
709 '13': '3gp',
710 '17': 'mp4',
711 '18': 'mp4',
712 '22': 'mp4',
d9bc015b 713 '37': 'mp4',
9e9647d9 714 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
715 '43': 'webm',
716 '45': 'webm',
7b7759f5 717 }
4fa74b52 718
719 @staticmethod
720 def suitable(url):
721 return (re.match(YoutubeIE._VALID_URL, url) is not None)
722
723 def report_lang(self):
724 """Report attempt to set language."""
331ce0a0 725 self._downloader.to_screen(u'[youtube] Setting language')
72ac78b8 726
727 def report_login(self):
728 """Report attempt to log in."""
331ce0a0 729 self._downloader.to_screen(u'[youtube] Logging in')
730
731 def report_age_confirmation(self):
732 """Report attempt to confirm age."""
331ce0a0 733 self._downloader.to_screen(u'[youtube] Confirming age')
bafa5cd9 734
735 def report_video_webpage_download(self, video_id):
736 """Report attempt to download video webpage."""
331ce0a0 737 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
e616ec0c 738
739 def report_video_info_webpage_download(self, video_id):
740 """Report attempt to download video info webpage."""
331ce0a0 741 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
742
743 def report_information_extraction(self, video_id):
744 """Report attempt to extract video information."""
331ce0a0 745 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
bafa5cd9 746
7b7759f5 747 def report_unavailable_format(self, video_id, format):
748 """Report extracted video URL."""
331ce0a0 749 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
7b7759f5 750
751 def report_rtmp_download(self):
752 """Indicate the download will use the RTMP protocol."""
331ce0a0 753 self._downloader.to_screen(u'[youtube] RTMP download detected')
0487b407 754
755 def _real_initialize(self):
756 if self._downloader is None:
757 return
758
759 username = None
760 password = None
d0a9affb 761 downloader_params = self._downloader.params
762
763 # Attempt to use provided username and password or .netrc data
764 if downloader_params.get('username', None) is not None:
765 username = downloader_params['username']
766 password = downloader_params['password']
767 elif downloader_params.get('usenetrc', False):
768 try:
769 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
770 if info is not None:
771 username = info[0]
772 password = info[2]
773 else:
774 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
775 except (IOError, netrc.NetrcParseError), err:
6f21f686 776 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
777 return
778
72ac78b8 779 # Set language
cc109403 780 request = urllib2.Request(self._LANG_URL, None, std_headers)
781 try:
782 self.report_lang()
783 urllib2.urlopen(request).read()
784 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
6f21f686 785 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
786 return
787
788 # No authentication to be performed
789 if username is None:
790 return
791
4fa74b52 792 # Log in
793 login_form = {
794 'current_form': 'loginForm',
795 'next': '/',
796 'action_login': 'Log In',
797 'username': username,
798 'password': password,
799 }
800 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
801 try:
bafa5cd9 802 self.report_login()
803 login_results = urllib2.urlopen(request).read()
804 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
6f21f686 805 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
806 return
807 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
6f21f686 808 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
809 return
810
811 # Confirm age
812 age_form = {
813 'next_url': '/',
814 'action_confirm': 'Confirm',
815 }
816 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
817 try:
bafa5cd9 818 self.report_age_confirmation()
819 age_results = urllib2.urlopen(request).read()
820 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 821 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
e5bf0f55 822 return
823
824 def _real_extract(self, url):
825 # Extract video id from URL
020f7150 826 mobj = re.match(self._VALID_URL, url)
4fa74b52 827 if mobj is None:
147753eb 828 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
6f21f686 829 return
830 video_id = mobj.group(2)
831
832 # Get video webpage
833 self.report_video_webpage_download(video_id)
834 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
835 try:
836 video_webpage = urllib2.urlopen(request).read()
837 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
838 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
839 return
968aa884 840
841 # Attempt to extract SWF player URL
842 mobj = re.search(r'swfConfig.*"(http://.*?watch.*?-.*?\.swf)"', video_webpage)
843 if mobj is not None:
844 player_url = mobj.group(1)
845 else:
846 player_url = None
847
848 # Get video info
849 self.report_video_info_webpage_download(video_id)
850 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
851 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
852 % (video_id, el_type))
853 request = urllib2.Request(video_info_url, None, std_headers)
e616ec0c 854 try:
855 video_info_webpage = urllib2.urlopen(request).read()
856 video_info = parse_qs(video_info_webpage)
857 if 'token' in video_info:
858 break
e616ec0c 859 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
497cd3e6 860 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
e616ec0c 861 return
862 if 'token' not in video_info:
863 if 'reason' in video_info:
8e686771 864 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
865 else:
866 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
867 return
868
869 # Start extracting information
870 self.report_information_extraction(video_id)
871
872 # uploader
873 if 'author' not in video_info:
874 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
875 return
876 video_uploader = urllib.unquote_plus(video_info['author'][0])
e616ec0c 877
878 # title
879 if 'title' not in video_info:
880 self._downloader.trouble(u'ERROR: unable to extract video title')
881 return
882 video_title = urllib.unquote_plus(video_info['title'][0])
883 video_title = video_title.decode('utf-8')
884 video_title = sanitize_title(video_title)
885
886 # simplified title
887 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
888 simple_title = simple_title.strip(ur'_')
889
890 # thumbnail image
891 if 'thumbnail_url' not in video_info:
892 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
893 video_thumbnail = ''
894 else: # don't panic if we can't find it
895 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
896
897 # description
898 video_description = 'No description available.'
899 if self._downloader.params.get('forcedescription', False):
900 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
e616ec0c 901 if mobj is not None:
902 video_description = mobj.group(1)
903
904 # token
905 video_token = urllib.unquote_plus(video_info['token'][0])
906
497cd3e6 907 # Decide which formats to download
2e3a32e4 908 requested_format = self._downloader.params.get('format', None)
5ce7d172 909 get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)
2e3a32e4 910
5ce7d172 911 if 'fmt_url_map' in video_info:
912 url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
913 format_limit = self._downloader.params.get('format_limit', None)
914 if format_limit is not None and format_limit in self._available_formats:
915 format_list = self._available_formats[self._available_formats.index(format_limit):]
e616ec0c 916 else:
917 format_list = self._available_formats
918 existing_formats = [x for x in format_list if x in url_map]
919 if len(existing_formats) == 0:
920 self._downloader.trouble(u'ERROR: no known formats available for video')
968aa884 921 return
497cd3e6 922 if requested_format is None:
5ce7d172 923 video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
497cd3e6 924 elif requested_format == '-1':
5ce7d172 925 video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
497cd3e6 926 else:
5ce7d172 927 video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format
2e3a32e4 928
929 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
930 self.report_rtmp_download()
931 video_url_list = [(None, video_info['conn'][0])]
2e3a32e4 932
933 else:
934 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
935 return
7b7759f5 936
937 for format_param, video_real_url in video_url_list:
938 # At this point we have a new video
939 self._downloader.increment_downloads()
940
941 # Extension
942 video_extension = self._video_extensions.get(format_param, 'flv')
7e58d568 943
497cd3e6 944 # Find the video URL in fmt_url_map or conn parameters
968aa884 945 try:
7b7759f5 946 # Process video information
947 self._downloader.process_info({
948 'id': video_id.decode('utf-8'),
949 'url': video_real_url.decode('utf-8'),
950 'uploader': video_uploader.decode('utf-8'),
951 'title': video_title,
952 'stitle': simple_title,
953 'ext': video_extension.decode('utf-8'),
6ba562b0 954 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
955 'thumbnail': video_thumbnail.decode('utf-8'),
956 'description': video_description.decode('utf-8'),
e616ec0c 957 'player_url': player_url,
7b7759f5 958 })
497cd3e6 959 except UnavailableVideoError, err:
5ce7d172 960 self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
42bcd27d 961
4fa74b52 962
963class MetacafeIE(InfoExtractor):
964 """Information Extractor for metacafe.com."""
965
966 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
2546e767 967 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
dbccb6cd 968 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
969 _youtube_ie = None
970
971 def __init__(self, youtube_ie, downloader=None):
972 InfoExtractor.__init__(self, downloader)
973 self._youtube_ie = youtube_ie
974
975 @staticmethod
976 def suitable(url):
977 return (re.match(MetacafeIE._VALID_URL, url) is not None)
978
979 def report_disclaimer(self):
980 """Report disclaimer retrieval."""
331ce0a0 981 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
982
983 def report_age_confirmation(self):
984 """Report attempt to confirm age."""
331ce0a0 985 self._downloader.to_screen(u'[metacafe] Confirming age')
986
987 def report_download_webpage(self, video_id):
988 """Report webpage download."""
331ce0a0 989 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
990
991 def report_extraction(self, video_id):
992 """Report information extraction."""
331ce0a0 993 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
994
995 def _real_initialize(self):
996 # Retrieve disclaimer
997 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
998 try:
999 self.report_disclaimer()
1000 disclaimer = urllib2.urlopen(request).read()
1001 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 1002 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1003 return
1004
1005 # Confirm age
1006 disclaimer_form = {
2546e767 1007 'filters': '0',
1008 'submit': "Continue - I'm over 18",
1009 }
dbccb6cd 1010 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1011 try:
1012 self.report_age_confirmation()
1013 disclaimer = urllib2.urlopen(request).read()
1014 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 1015 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
020f7150
RG
1016 return
1017
1018 def _real_extract(self, url):
1019 # Extract id and simplified title from URL
1020 mobj = re.match(self._VALID_URL, url)
1021 if mobj is None:
147753eb 1022 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
6f21f686 1023 return
1024
1025 video_id = mobj.group(1)
1026
1027 # Check if video comes from YouTube
1028 mobj2 = re.match(r'^yt-(.*)$', video_id)
1029 if mobj2 is not None:
1030 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1031 return
020f7150 1032
df372a65 1033 # At this point we have a new video
9bf7fa52 1034 self._downloader.increment_downloads()
df372a65 1035
020f7150 1036 simple_title = mobj.group(2).decode('utf-8')
1037
1038 # Retrieve video webpage to extract further information
1039 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1040 try:
1041 self.report_download_webpage(video_id)
1042 webpage = urllib2.urlopen(request).read()
1043 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 1044 self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
6f21f686 1045 return
1046
1047 # Extract URL, uploader and title from webpage
1048 self.report_extraction(video_id)
18963a36 1049 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1050 if mobj is not None:
1051 mediaURL = urllib.unquote(mobj.group(1))
6b57e8c5 1052 video_extension = mediaURL[-3:]
1053
1054 # Extract gdaKey if available
1055 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1056 if mobj is None:
1057 video_url = mediaURL
1058 else:
1059 gdaKey = mobj.group(1)
1060 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
109626fc 1061 else:
1062 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1063 if mobj is None:
1064 self._downloader.trouble(u'ERROR: unable to extract media URL')
1065 return
1066 vardict = parse_qs(mobj.group(1))
1067 if 'mediaData' not in vardict:
1068 self._downloader.trouble(u'ERROR: unable to extract media URL')
1069 return
1070 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1071 if mobj is None:
1072 self._downloader.trouble(u'ERROR: unable to extract media URL')
1073 return
1074 mediaURL = mobj.group(1).replace('\\/', '/')
1075 video_extension = mediaURL[-3:]
1076 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
020f7150 1077
2546e767 1078 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
020f7150 1079 if mobj is None:
147753eb 1080 self._downloader.trouble(u'ERROR: unable to extract title')
6f21f686 1081 return
020f7150 1082 video_title = mobj.group(1).decode('utf-8')
490fd7ae 1083 video_title = sanitize_title(video_title)
020f7150 1084
29f07568 1085 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
020f7150 1086 if mobj is None:
147753eb 1087 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
6f21f686 1088 return
dbccb6cd 1089 video_uploader = mobj.group(1)
020f7150 1090
42bcd27d 1091 try:
1092 # Process video information
1093 self._downloader.process_info({
1094 'id': video_id.decode('utf-8'),
1095 'url': video_url.decode('utf-8'),
1096 'uploader': video_uploader.decode('utf-8'),
1097 'title': video_title,
1098 'stitle': simple_title,
1099 'ext': video_extension.decode('utf-8'),
6ba562b0 1100 'format': u'NA',
e616ec0c 1101 'player_url': None,
42bcd27d 1102 })
1103 except UnavailableVideoError:
1104 self._downloader.trouble(u'ERROR: unable to download video')
020f7150 1105
25af2bce 1106
1107class DailymotionIE(InfoExtractor):
1108 """Information Extractor for Dailymotion"""
1109
1110 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1111
1112 def __init__(self, downloader=None):
1113 InfoExtractor.__init__(self, downloader)
1114
1115 @staticmethod
1116 def suitable(url):
1117 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1118
1119 def report_download_webpage(self, video_id):
1120 """Report webpage download."""
331ce0a0 1121 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1122
1123 def report_extraction(self, video_id):
1124 """Report information extraction."""
331ce0a0 1125 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1126
1127 def _real_initialize(self):
1128 return
1129
1130 def _real_extract(self, url):
1131 # Extract id and simplified title from URL
1132 mobj = re.match(self._VALID_URL, url)
1133 if mobj is None:
1134 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1135 return
1136
df372a65 1137 # At this point we have a new video
9bf7fa52 1138 self._downloader.increment_downloads()
1139 video_id = mobj.group(1)
1140
1141 simple_title = mobj.group(2).decode('utf-8')
1142 video_extension = 'flv'
1143
1144 # Retrieve video webpage to extract further information
1145 request = urllib2.Request(url)
1146 try:
1147 self.report_download_webpage(video_id)
1148 webpage = urllib2.urlopen(request).read()
1149 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1150 self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
1151 return
1152
1153 # Extract URL, uploader and title from webpage
1154 self.report_extraction(video_id)
1155 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1156 if mobj is None:
1157 self._downloader.trouble(u'ERROR: unable to extract media URL')
1158 return
1159 mediaURL = urllib.unquote(mobj.group(1))
1160
1161 # if needed, prepend http://www.dailymotion.com/ to a relative mediaURL here
1162
1163 video_url = mediaURL
1164
1165 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1166 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1167 if mobj is None:
1168 self._downloader.trouble(u'ERROR: unable to extract title')
1169 return
1170 video_title = mobj.group(1).decode('utf-8')
1171 video_title = sanitize_title(video_title)
1172
33407be7 1173 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
1174 if mobj is None:
1175 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1176 return
1177 video_uploader = mobj.group(1)
1178
1179 try:
1180 # Process video information
1181 self._downloader.process_info({
1182 'id': video_id.decode('utf-8'),
1183 'url': video_url.decode('utf-8'),
1184 'uploader': video_uploader.decode('utf-8'),
1185 'title': video_title,
1186 'stitle': simple_title,
1187 'ext': video_extension.decode('utf-8'),
1188 'format': u'NA',
1189 'player_url': None,
1190 })
1191 except UnavailableVideoError:
1192 self._downloader.trouble(u'ERROR: unable to download video')
4135fa45 1193
49c0028a 1194class GoogleIE(InfoExtractor):
1195 """Information extractor for video.google.com."""
1196
490fd7ae 1197 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
49c0028a 1198
1199 def __init__(self, downloader=None):
1200 InfoExtractor.__init__(self, downloader)
1201
1202 @staticmethod
1203 def suitable(url):
1204 return (re.match(GoogleIE._VALID_URL, url) is not None)
1205
1206 def report_download_webpage(self, video_id):
1207 """Report webpage download."""
331ce0a0 1208 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
49c0028a 1209
1210 def report_extraction(self, video_id):
1211 """Report information extraction."""
331ce0a0 1212 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
49c0028a 1213
1214 def _real_initialize(self):
1215 return
1216
1217 def _real_extract(self, url):
1218 # Extract id from URL
1219 mobj = re.match(self._VALID_URL, url)
1220 if mobj is None:
1221 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1222 return
1223
df372a65 1224 # At this point we have a new video
9bf7fa52 1225 self._downloader.increment_downloads()
49c0028a 1226 video_id = mobj.group(1)
1227
1228 video_extension = 'mp4'
1229
1230 # Retrieve video webpage to extract further information
490fd7ae 1231 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
49c0028a 1232 try:
1233 self.report_download_webpage(video_id)
1234 webpage = urllib2.urlopen(request).read()
1235 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1236 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1237 return
1238
1239 # Extract URL, uploader, and title from webpage
1240 self.report_extraction(video_id)
1241 mobj = re.search(r"download_url:'([^']+)'", webpage)
1242 if mobj is None:
1243 video_extension = 'flv'
1244 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
49c0028a 1245 if mobj is None:
1246 self._downloader.trouble(u'ERROR: unable to extract media URL')
1247 return
1248 mediaURL = urllib.unquote(mobj.group(1))
1249 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1250 mediaURL = mediaURL.replace('\\x26', '\x26')
1251
1252 video_url = mediaURL
1253
1254 mobj = re.search(r'<title>(.*)</title>', webpage)
1255 if mobj is None:
1256 self._downloader.trouble(u'ERROR: unable to extract title')
1257 return
1258 video_title = mobj.group(1).decode('utf-8')
490fd7ae 1259 video_title = sanitize_title(video_title)
31cbdaaf 1260 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
49c0028a 1261
1262 # Extract video description
1263 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1264 if mobj is None:
1265 self._downloader.trouble(u'ERROR: unable to extract video description')
1266 return
1267 video_description = mobj.group(1).decode('utf-8')
1268 if not video_description:
1269 video_description = 'No description available.'
1270
1271 # Extract video thumbnail
1272 if self._downloader.params.get('forcethumbnail', False):
1273 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1274 try:
1275 webpage = urllib2.urlopen(request).read()
1276 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1277 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1278 return
1279 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1280 if mobj is None:
1281 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1282 return
1283 video_thumbnail = mobj.group(1)
1284 else: # we need something to pass to process_info
1285 video_thumbnail = ''
1286
1287
49c0028a 1288 try:
1289 # Process video information
1290 self._downloader.process_info({
1291 'id': video_id.decode('utf-8'),
1292 'url': video_url.decode('utf-8'),
6ba562b0 1293 'uploader': u'NA',
490fd7ae 1294 'title': video_title,
31cbdaaf 1295 'stitle': simple_title,
49c0028a 1296 'ext': video_extension.decode('utf-8'),
6ba562b0 1297 'format': u'NA',
e616ec0c 1298 'player_url': None,
49c0028a 1299 })
1300 except UnavailableVideoError:
1301 self._downloader.trouble(u'ERROR: unable to download video')
49c0028a 1302
1303
1304class PhotobucketIE(InfoExtractor):
1305 """Information extractor for photobucket.com."""
1306
1307 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1308
1309 def __init__(self, downloader=None):
1310 InfoExtractor.__init__(self, downloader)
1311
1312 @staticmethod
1313 def suitable(url):
1314 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1315
1316 def report_download_webpage(self, video_id):
1317 """Report webpage download."""
331ce0a0 1318 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
49c0028a 1319
1320 def report_extraction(self, video_id):
1321 """Report information extraction."""
331ce0a0 1322 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
49c0028a 1323
1324 def _real_initialize(self):
1325 return
1326
1327 def _real_extract(self, url):
1328 # Extract id from URL
1329 mobj = re.match(self._VALID_URL, url)
1330 if mobj is None:
1331 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1332 return
1333
df372a65 1334 # At this point we have a new video
9bf7fa52 1335 self._downloader.increment_downloads()
49c0028a 1336 video_id = mobj.group(1)
1337
1338 video_extension = 'flv'
1339
1340 # Retrieve video webpage to extract further information
1341 request = urllib2.Request(url)
1342 try:
1343 self.report_download_webpage(video_id)
1344 webpage = urllib2.urlopen(request).read()
1345 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1346 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1347 return
1348
1349 # Extract URL, uploader, and title from webpage
1350 self.report_extraction(video_id)
1351 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1352 if mobj is None:
1353 self._downloader.trouble(u'ERROR: unable to extract media URL')
1354 return
1355 mediaURL = urllib.unquote(mobj.group(1))
1356
1357 video_url = mediaURL
1358
1359 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1360 if mobj is None:
1361 self._downloader.trouble(u'ERROR: unable to extract title')
1362 return
1363 video_title = mobj.group(1).decode('utf-8')
490fd7ae 1364 video_title = sanitize_title(video_title)
31cbdaaf 1365 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
49c0028a 1366
1367 video_uploader = mobj.group(2).decode('utf-8')
1368
1369 try:
1370 # Process video information
1371 self._downloader.process_info({
1372 'id': video_id.decode('utf-8'),
1373 'url': video_url.decode('utf-8'),
1374 'uploader': video_uploader,
1375 'title': video_title,
31cbdaaf 1376 'stitle': simple_title,
490fd7ae 1377 'ext': video_extension.decode('utf-8'),
6ba562b0 1378 'format': u'NA',
e616ec0c 1379 'player_url': None,
490fd7ae 1380 })
1381 except UnavailableVideoError:
1382 self._downloader.trouble(u'ERROR: unable to download video')
1383
1384
1385class YahooIE(InfoExtractor):
1386 """Information extractor for video.yahoo.com."""
1387
1388 # _VALID_URL matches all Yahoo! Video URLs
1389 # _VPAGE_URL matches only the extractable '/watch/' URLs
1390 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1391 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1392
1393 def __init__(self, downloader=None):
1394 InfoExtractor.__init__(self, downloader)
1395
1396 @staticmethod
1397 def suitable(url):
1398 return (re.match(YahooIE._VALID_URL, url) is not None)
1399
1400 def report_download_webpage(self, video_id):
1401 """Report webpage download."""
331ce0a0 1402 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1403
1404 def report_extraction(self, video_id):
1405 """Report information extraction."""
331ce0a0 1406 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1407
1408 def _real_initialize(self):
1409 return
1410
df372a65 1411 def _real_extract(self, url, new_video=True):
1412 # Extract ID from URL
1413 mobj = re.match(self._VALID_URL, url)
1414 if mobj is None:
1415 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1416 return
1417
df372a65 1418 # At this point we have a new video
9bf7fa52 1419 self._downloader.increment_downloads()
1420 video_id = mobj.group(2)
1421 video_extension = 'flv'
1422
1423 # Rewrite valid but non-extractable URLs as
1424 # extractable English language /watch/ URLs
1425 if re.match(self._VPAGE_URL, url) is None:
1426 request = urllib2.Request(url)
1427 try:
1428 webpage = urllib2.urlopen(request).read()
1429 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1430 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1431 return
1432
1433 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1434 if mobj is None:
1435 self._downloader.trouble(u'ERROR: Unable to extract id field')
1436 return
1437 yahoo_id = mobj.group(1)
1438
1439 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1440 if mobj is None:
1441 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1442 return
1443 yahoo_vid = mobj.group(1)
1444
1445 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
df372a65 1446 return self._real_extract(url, new_video=False)
61945318
RG
1447
1448 # Retrieve video webpage to extract further information
1449 request = urllib2.Request(url)
1450 try:
1451 self.report_download_webpage(video_id)
1452 webpage = urllib2.urlopen(request).read()
1453 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1454 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1455 return
1456
1457 # Extract uploader and title from webpage
1458 self.report_extraction(video_id)
1459 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1460 if mobj is None:
1461 self._downloader.trouble(u'ERROR: unable to extract video title')
1462 return
1463 video_title = mobj.group(1).decode('utf-8')
1464 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1465
1466 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1467 if mobj is None:
1468 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1469 return
1470 video_uploader = mobj.group(2).decode('utf-8')
1471
7e58d568
RG
1472 # Extract video thumbnail
1473 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1474 if mobj is None:
1475 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1476 return
1477 video_thumbnail = mobj.group(1).decode('utf-8')
1478
1479 # Extract video description
1480 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1481 if mobj is None:
1482 self._downloader.trouble(u'ERROR: unable to extract video description')
1483 return
1484 video_description = mobj.group(1).decode('utf-8')
1485 if not video_description: video_description = 'No description available.'
1486
61945318
RG
1487 # Extract video height and width
1488 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1489 if mobj is None:
1490 self._downloader.trouble(u'ERROR: unable to extract video height')
1491 return
1492 yv_video_height = mobj.group(1)
1493
1494 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1495 if mobj is None:
1496 self._downloader.trouble(u'ERROR: unable to extract video width')
1497 return
1498 yv_video_width = mobj.group(1)
1499
1500 # Retrieve video playlist to extract media URL
1501 # I'm not completely sure what all these options are, but we
1502 # seem to need most of them, otherwise the server sends a 401.
1503 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1504 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1505 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1506 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1507 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
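# The assembled playlist URL ends up looking roughly like the following
# (node_id, vidH and vidW are hypothetical example values):
#   http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=12345678&tech=flash&mode=playlist&lg=R0xx6idZnW2zlrKP8xxAIR&bitrate=700&vidH=360&vidW=640&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797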
1508 try:
1509 self.report_download_webpage(video_id)
1510 webpage = urllib2.urlopen(request).read()
1511 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1512 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1513 return
1514
1515 # Extract media URL from playlist XML
1516 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1517 if mobj is None:
1518 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1519 return
1520 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1521 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1522
1523 try:
1524 # Process video information
1525 self._downloader.process_info({
1526 'id': video_id.decode('utf-8'),
1527 'url': video_url,
1528 'uploader': video_uploader,
1529 'title': video_title,
1530 'stitle': simple_title,
1531 'ext': video_extension.decode('utf-8'),
7e58d568
RG
1532 'thumbnail': video_thumbnail,
1533 'description': video_description,
e616ec0c 1536 'player_url': None,
61945318 1537 })
73f4e7af
RG
1538 except UnavailableVideoError:
1539 self._downloader.trouble(u'ERROR: unable to download video')
61945318
RG
1540
1541
490fd7ae
RG
1542class GenericIE(InfoExtractor):
1543 """Generic last-resort information extractor."""
1544
1545 def __init__(self, downloader=None):
1546 InfoExtractor.__init__(self, downloader)
1547
1548 @staticmethod
1549 def suitable(url):
1550 return True
1551
1552 def report_download_webpage(self, video_id):
1553 """Report webpage download."""
331ce0a0
RG
1554 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1555 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
490fd7ae
RG
1556
1557 def report_extraction(self, video_id):
1558 """Report information extraction."""
331ce0a0 1559 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
490fd7ae
RG
1560
1561 def _real_initialize(self):
1562 return
1563
1564 def _real_extract(self, url):
df372a65 1565 # At this point we have a new video
9bf7fa52 1566 self._downloader.increment_downloads()
df372a65 1567
490fd7ae
RG
1568 video_id = url.split('/')[-1]
1569 request = urllib2.Request(url)
1570 try:
1571 self.report_download_webpage(video_id)
1572 webpage = urllib2.urlopen(request).read()
1573 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1574 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1575 return
1576 except ValueError, err:
1577 # since this is the last-resort InfoExtractor, if
1578 # this error is thrown, it'll be thrown here
1579 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1580 return
1581
1582 # Start with something easy: JW Player in SWFObject
1583 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1584 if mobj is None:
1585 # Broaden the search a little bit
1586 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1587 if mobj is None:
1588 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1589 return
1590
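# For reference, the first pattern above targets JW Player/SWFObject embeds along the
# lines of (hypothetical snippet): flashvars: 'file=http://example.com/video.flv&autostart=true'
# while the broader fallback also catches bare 'file=' or 'source=' query parameters.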
1591 # It's possible that one of the regexes
1592 # matched, but returned an empty group:
1593 if mobj.group(1) is None:
1594 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1595 return
1596
1597 video_url = urllib.unquote(mobj.group(1))
1598 video_id = os.path.basename(video_url)
1599
1600 # here's a fun little line of code for you:
1601 video_extension = os.path.splitext(video_id)[1][1:]
1602 video_id = os.path.splitext(video_id)[0]
1603
1604 # it's tempting to parse this further, but you would
1605 # have to take into account all the variations like
1606 # Video Title - Site Name
1607 # Site Name | Video Title
1608 # Video Title - Tagline | Site Name
1609 # and so on and so forth; it's just not practical
1610 mobj = re.search(r'<title>(.*)</title>', webpage)
1611 if mobj is None:
1612 self._downloader.trouble(u'ERROR: unable to extract title')
1613 return
1614 video_title = mobj.group(1).decode('utf-8')
1615 video_title = sanitize_title(video_title)
31cbdaaf 1616 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
490fd7ae
RG
1617
1618 # video uploader is domain name
1619 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1620 if mobj is None:
1621 self._downloader.trouble(u'ERROR: unable to extract uploader (domain name)')
1622 return
1623 video_uploader = mobj.group(1).decode('utf-8')
1624
1625 try:
1626 # Process video information
1627 self._downloader.process_info({
1628 'id': video_id.decode('utf-8'),
1629 'url': video_url.decode('utf-8'),
1630 'uploader': video_uploader,
1631 'title': video_title,
31cbdaaf 1632 'stitle': simple_title,
49c0028a 1633 'ext': video_extension.decode('utf-8'),
6ba562b0 1634 'format': u'NA',
e616ec0c 1635 'player_url': None,
49c0028a 1636 })
73f4e7af
RG
1637 except UnavailableVideoError, err:
1638 self._downloader.trouble(u'ERROR: unable to download video')
49c0028a 1639
1640
25af2bce
RG
1641class YoutubeSearchIE(InfoExtractor):
1642 """Information Extractor for YouTube search queries."""
1643 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1644 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1645 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
304a4d85 1646 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
25af2bce 1647 _youtube_ie = None
fd9288c3 1648 _max_youtube_results = 1000
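	# Query syntax handled below (illustrative examples, not taken from the original):
	#   ytsearch:cute cats      -> download the first matching video
	#   ytsearch5:cute cats     -> download the first 5 matching videos
	#   ytsearchall:cute cats   -> download up to _max_youtube_results videos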
25af2bce 1649
f995f712 1650 def __init__(self, youtube_ie, downloader=None):
25af2bce
RG
1651 InfoExtractor.__init__(self, downloader)
1652 self._youtube_ie = youtube_ie
1653
1654 @staticmethod
1655 def suitable(url):
1656 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1657
1658 def report_download_page(self, query, pagenum):
1659 """Report attempt to download playlist page with given number."""
490fd7ae 1660 query = query.decode(preferredencoding())
331ce0a0 1661 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
25af2bce
RG
1662
1663 def _real_initialize(self):
1664 self._youtube_ie.initialize()
1665
1666 def _real_extract(self, query):
1667 mobj = re.match(self._VALID_QUERY, query)
1668 if mobj is None:
147753eb 1669 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
6f21f686 1670 return
25af2bce
RG
1671
1672 prefix, query = query.split(':')
1673 prefix = prefix[8:]
490fd7ae 1674 query = query.encode('utf-8')
f995f712 1675 if prefix == '':
6f21f686
RG
1676 self._download_n_results(query, 1)
1677 return
f995f712 1678 elif prefix == 'all':
6f21f686
RG
1679 self._download_n_results(query, self._max_youtube_results)
1680 return
f995f712 1681 else:
25af2bce 1682 try:
e1f18b8a 1683 n = long(prefix)
25af2bce 1684 if n <= 0:
147753eb 1685 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
6f21f686 1686 return
257453b9 1687 elif n > self._max_youtube_results:
6f21f686 1688 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
257453b9 1689 n = self._max_youtube_results
6f21f686
RG
1690 self._download_n_results(query, n)
1691 return
e1f18b8a 1692 except ValueError: # parsing prefix as integer fails
6f21f686
RG
1693 self._download_n_results(query, 1)
1694 return
25af2bce
RG
1695
1696 def _download_n_results(self, query, n):
1697 """Downloads a specified number of results for a query"""
1698
1699 video_ids = []
1700 already_seen = set()
1701 pagenum = 1
1702
1703 while True:
1704 self.report_download_page(query, pagenum)
a9633f14 1705 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
25af2bce
RG
1706 request = urllib2.Request(result_url, None, std_headers)
1707 try:
1708 page = urllib2.urlopen(request).read()
1709 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 1710 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
6f21f686 1711 return
25af2bce
RG
1712
1713 # Extract video identifiers
1714 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1715 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1716 if video_id not in already_seen:
1717 video_ids.append(video_id)
1718 already_seen.add(video_id)
1719 if len(video_ids) == n:
1720 # Specified n videos reached
25af2bce 1721 for id in video_ids:
6f21f686
RG
1722 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1723 return
25af2bce 1724
304a4d85 1725 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
25af2bce 1726 for id in video_ids:
6f21f686
RG
1727 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1728 return
25af2bce
RG
1729
1730 pagenum = pagenum + 1
1731
7e58d568
RG
1732class GoogleSearchIE(InfoExtractor):
1733 """Information Extractor for Google Video search queries."""
1734 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1735 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1736 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1737 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1738 _google_ie = None
1739 _max_google_results = 1000
1740
1741 def __init__(self, google_ie, downloader=None):
1742 InfoExtractor.__init__(self, downloader)
1743 self._google_ie = google_ie
1744
1745 @staticmethod
1746 def suitable(url):
1747 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1748
1749 def report_download_page(self, query, pagenum):
1750 """Report attempt to download playlist page with given number."""
1751 query = query.decode(preferredencoding())
331ce0a0 1752 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
7e58d568
RG
1753
1754 def _real_initialize(self):
1755 self._google_ie.initialize()
1756
1757 def _real_extract(self, query):
1758 mobj = re.match(self._VALID_QUERY, query)
1759 if mobj is None:
1760 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1761 return
1762
1763 prefix, query = query.split(':')
1764 prefix = prefix[8:]
1765 query = query.encode('utf-8')
1766 if prefix == '':
1767 self._download_n_results(query, 1)
1768 return
1769 elif prefix == 'all':
1770 self._download_n_results(query, self._max_google_results)
1771 return
1772 else:
1773 try:
1774 n = long(prefix)
1775 if n <= 0:
1776 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1777 return
1778 elif n > self._max_google_results:
1779 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1780 n = self._max_google_results
1781 self._download_n_results(query, n)
1782 return
1783 except ValueError: # parsing prefix as integer fails
1784 self._download_n_results(query, 1)
1785 return
1786
1787 def _download_n_results(self, query, n):
1788 """Downloads a specified number of results for a query"""
1789
1790 video_ids = []
1791 already_seen = set()
1792 pagenum = 1
1793
1794 while True:
1795 self.report_download_page(query, pagenum)
1796 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1797 request = urllib2.Request(result_url, None, std_headers)
1798 try:
1799 page = urllib2.urlopen(request).read()
1800 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1801 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1802 return
1803
1804 # Extract video identifiers
1805 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1806 video_id = mobj.group(1)
1807 if video_id not in already_seen:
1808 video_ids.append(video_id)
1809 already_seen.add(video_id)
1810 if len(video_ids) == n:
1811 # Specified n videos reached
1812 for id in video_ids:
1813 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1814 return
1815
1816 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1817 for id in video_ids:
1818 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1819 return
1820
1821 pagenum = pagenum + 1
1822
1823class YahooSearchIE(InfoExtractor):
1824 """Information Extractor for Yahoo! Video search queries."""
1825 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1826 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1827 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1828 _MORE_PAGES_INDICATOR = r'\s*Next'
1829 _yahoo_ie = None
1830 _max_yahoo_results = 1000
1831
1832 def __init__(self, yahoo_ie, downloader=None):
1833 InfoExtractor.__init__(self, downloader)
1834 self._yahoo_ie = yahoo_ie
1835
1836 @staticmethod
1837 def suitable(url):
1838 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1839
1840 def report_download_page(self, query, pagenum):
1841 """Report attempt to download playlist page with given number."""
1842 query = query.decode(preferredencoding())
331ce0a0 1843 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
7e58d568
RG
1844
1845 def _real_initialize(self):
1846 self._yahoo_ie.initialize()
1847
1848 def _real_extract(self, query):
1849 mobj = re.match(self._VALID_QUERY, query)
1850 if mobj is None:
1851 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1852 return
1853
1854 prefix, query = query.split(':')
1855 prefix = prefix[8:]
1856 query = query.encode('utf-8')
1857 if prefix == '':
1858 self._download_n_results(query, 1)
1859 return
1860 elif prefix == 'all':
1861 self._download_n_results(query, self._max_yahoo_results)
1862 return
1863 else:
1864 try:
1865 n = long(prefix)
1866 if n <= 0:
1867 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1868 return
1869 elif n > self._max_yahoo_results:
1870 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1871 n = self._max_yahoo_results
1872 self._download_n_results(query, n)
1873 return
1874 except ValueError: # parsing prefix as integer fails
1875 self._download_n_results(query, 1)
1876 return
1877
1878 def _download_n_results(self, query, n):
1879 """Downloads a specified number of results for a query"""
1880
1881 video_ids = []
1882 already_seen = set()
1883 pagenum = 1
1884
1885 while True:
1886 self.report_download_page(query, pagenum)
1887 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1888 request = urllib2.Request(result_url, None, std_headers)
1889 try:
1890 page = urllib2.urlopen(request).read()
1891 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1892 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1893 return
1894
1895 # Extract video identifiers
1896 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1897 video_id = mobj.group(1)
1898 if video_id not in already_seen:
1899 video_ids.append(video_id)
1900 already_seen.add(video_id)
1901 if len(video_ids) == n:
1902 # Specified n videos reached
1903 for id in video_ids:
1904 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1905 return
1906
1907 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1908 for id in video_ids:
1909 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1910 return
1911
1912 pagenum = pagenum + 1
1913
0c2dc87d
RG
1914class YoutubePlaylistIE(InfoExtractor):
1915 """Information Extractor for YouTube playlists."""
1916
9177ce4d 1917 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube\.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
72ac78b8 1918 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
0c2dc87d 1919 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
ce5cafea 1920 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
0c2dc87d
RG
1921 _youtube_ie = None
1922
1923 def __init__(self, youtube_ie, downloader=None):
1924 InfoExtractor.__init__(self, downloader)
1925 self._youtube_ie = youtube_ie
1926
1927 @staticmethod
1928 def suitable(url):
1929 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1930
1931 def report_download_page(self, playlist_id, pagenum):
1932 """Report attempt to download playlist page with given number."""
331ce0a0 1933 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
0c2dc87d
RG
1934
1935 def _real_initialize(self):
1936 self._youtube_ie.initialize()
1937
1938 def _real_extract(self, url):
1939 # Extract playlist id
1940 mobj = re.match(self._VALID_URL, url)
1941 if mobj is None:
147753eb 1942 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
6f21f686 1943 return
0c2dc87d
RG
1944
1945 # Download playlist pages
1946 playlist_id = mobj.group(1)
1947 video_ids = []
1948 pagenum = 1
1949
1950 while True:
1951 self.report_download_page(playlist_id, pagenum)
1952 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1953 try:
1954 page = urllib2.urlopen(request).read()
1955 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 1956 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
6f21f686 1957 return
0c2dc87d
RG
1958
1959 # Extract video identifiers
27d98b6e 1960 ids_in_page = []
0c2dc87d 1961 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
27d98b6e
RG
1962 if mobj.group(1) not in ids_in_page:
1963 ids_in_page.append(mobj.group(1))
1964 video_ids.extend(ids_in_page)
0c2dc87d 1965
ce5cafea 1966 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
0c2dc87d
RG
1967 break
1968 pagenum = pagenum + 1
1969
8cc44341
RG
1970 playliststart = self._downloader.params.get('playliststart', 1) - 1
1971 playlistend = self._downloader.params.get('playlistend', -1)
1972 if playlistend == -1:
	video_ids = video_ids[playliststart:]
else:
	video_ids = video_ids[playliststart:playlistend]
1973
0c2dc87d 1974 for id in video_ids:
6f21f686
RG
1975 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1976 return
0c2dc87d 1977
c39c05cd
A
1978class YoutubeUserIE(InfoExtractor):
1979 """Information Extractor for YouTube users."""
1980
1981 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube\.com/user/([^/]+)'
1982 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
75a4cf3c 1983 _VIDEO_INDICATOR = r'http://gdata\.youtube\.com/feeds/api/videos/([^<"\']+)' # stop at quotes/tags instead of grabbing the rest of the line
c39c05cd
A
1984 _youtube_ie = None
1985
1986 def __init__(self, youtube_ie, downloader=None):
1987 InfoExtractor.__init__(self, downloader)
1988 self._youtube_ie = youtube_ie
1989
1990 @staticmethod
1991 def suitable(url):
1992 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1993
1994 def report_download_page(self, username):
1995 """Report attempt to download user page."""
331ce0a0 1996 self._downloader.to_screen(u'[youtube] user %s: Downloading user page' % (username))
c39c05cd
A
1997
1998 def _real_initialize(self):
1999 self._youtube_ie.initialize()
2000
2001 def _real_extract(self, url):
2002 # Extract username
2003 mobj = re.match(self._VALID_URL, url)
2004 if mobj is None:
2005 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2006 return
2007
2008 # Download user page
2009 username = mobj.group(1)
2010 video_ids = []
2011 pagenum = 1
2012
2013 self.report_download_page(username)
2014 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2015 try:
2016 page = urllib2.urlopen(request).read()
2017 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2018 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2019 return
2020
2021 # Extract video identifiers
2022 ids_in_page = []
2023
2024 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
c39c05cd
A
2025 if mobj.group(1) not in ids_in_page:
2026 ids_in_page.append(mobj.group(1))
2027 video_ids.extend(ids_in_page)
2028
8cc44341
RG
2029 playliststart = self._downloader.params.get('playliststart', 1) - 1
2030 playlistend = self._downloader.params.get('playlistend', -1)
2031 if playlistend == -1:
	video_ids = video_ids[playliststart:]
else:
	video_ids = video_ids[playliststart:playlistend]
204c9398 2032
c39c05cd
A
2033 for id in video_ids:
2034 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2035 return
2036
65cd34c5
RG
2037class PostProcessor(object):
2038 """Post Processor class.
2039
2040 PostProcessor objects can be added to downloaders with their
2041 add_post_processor() method. When the downloader has finished a
2042 successful download, it will take its internal chain of PostProcessors
2043 and start calling the run() method on each one of them, first with
2044 an initial argument and then with the returned value of the previous
2045 PostProcessor.
2046
2047 The chain will be stopped if one of them ever returns None or the end
2048 of the chain is reached.
2049
2050 PostProcessor objects follow a "mutual registration" process similar
2051 to InfoExtractor objects.
2052 """
2053
2054 _downloader = None
2055
2056 def __init__(self, downloader=None):
2057 self._downloader = downloader
2058
65cd34c5
RG
2059 def set_downloader(self, downloader):
2060 """Sets the downloader for this PP."""
2061 self._downloader = downloader
2062
2063 def run(self, information):
2064 """Run the PostProcessor.
2065
2066 The "information" argument is a dictionary like the ones
2f11508a 2067 composed by InfoExtractors. The only difference is that this
65cd34c5
RG
2068 one has an extra field called "filepath" that points to the
2069 downloaded file.
2070
2071 When this method returns None, the postprocessing chain is
2072 stopped. However, this method may return an information
2073 dictionary that will be passed to the next postprocessing
2074 object in the chain. It can be the one it received after
2075 changing some fields.
2076
2077 In addition, this method may raise a PostProcessingError
2078 exception that will be taken into account by the downloader
2079 it was called from.
2080 """
2081 return information # by default, do nothing
2082
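# Illustrative sketch only: a minimal PostProcessor subclass showing how the chain
# described above can be extended. It is not registered anywhere by default; a
# downloader would opt in with something like fd.add_post_processor(PrintFilepathPP()).
class PrintFilepathPP(PostProcessor):
	"""Example post processor that merely reports where the file ended up."""

	def run(self, information):
		# "filepath" is the extra field the downloader adds (see the docstring above)
		self._downloader.to_screen(u'[postprocess] downloaded to %s' % information['filepath'])
		return information # returning the dict keeps the chain going; None would stop it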
2083### MAIN PROGRAM ###
4fa74b52
RG
2084if __name__ == '__main__':
2085 try:
f9f1e798 2086 # Modules needed only when running the main program
209e9e27 2087 import getpass
f9f1e798
RG
2088 import optparse
2089
4bec29ef
RG
2090 # Function to update the program file with the latest version from bitbucket.org
2091 def update_self(downloader, filename):
2092 # Note: downloader only used for options
2093 if not os.access (filename, os.W_OK):
2094 sys.exit('ERROR: no write permissions on %s' % filename)
2095
331ce0a0 2096 downloader.to_screen('Updating to latest stable version...')
893a13df 2097 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
4bec29ef 2098 latest_version = urllib.urlopen(latest_url).read().strip()
893a13df 2099 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
4bec29ef
RG
2100 newcontent = urllib.urlopen(prog_url).read()
2101 stream = open(filename, 'w')
2102 stream.write(newcontent)
2103 stream.close()
331ce0a0 2104 downloader.to_screen('Updated to version %s' % latest_version)
4bec29ef 2105
f9f1e798 2106 # Parse command line
209e9e27 2107 parser = optparse.OptionParser(
7b7759f5 2108 usage='Usage: %prog [options] url...',
c34e3584 2109 version='2010.10.24',
7b7759f5 2110 conflict_handler='resolve',
2111 )
2112
209e9e27
RG
2113 parser.add_option('-h', '--help',
2114 action='help', help='print this help text and exit')
2115 parser.add_option('-v', '--version',
2116 action='version', help='print program version and exit')
4bec29ef
RG
2117 parser.add_option('-U', '--update',
2118 action='store_true', dest='update_self', help='update this program to latest stable version')
7b7759f5 2119 parser.add_option('-i', '--ignore-errors',
2120 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2121 parser.add_option('-r', '--rate-limit',
2b06c33d 2122 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
7031008c 2123 parser.add_option('-R', '--retries',
2b06c33d 2124 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
204c9398
RG
2125 parser.add_option('--playlist-start',
2126 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
8cc44341
RG
2127 parser.add_option('--playlist-end',
2128 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
7b7759f5 2129
2130 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2131 authentication.add_option('-u', '--username',
2b06c33d 2132 dest='username', metavar='USERNAME', help='account username')
7b7759f5 2133 authentication.add_option('-p', '--password',
2b06c33d 2134 dest='password', metavar='PASSWORD', help='account password')
7b7759f5 2135 authentication.add_option('-n', '--netrc',
209e9e27 2136 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
7b7759f5 2137 parser.add_option_group(authentication)
2138
2139 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2140 video_format.add_option('-f', '--format',
2b06c33d 2141 action='store', dest='format', metavar='FORMAT', help='video format code')
7b7759f5 2142 video_format.add_option('-m', '--mobile-version',
b74c859d 2143 action='store_const', dest='format', help='alias for -f 17', const='17')
6ba562b0
RG
2144 video_format.add_option('--all-formats',
2145 action='store_const', dest='format', help='download all available video formats', const='-1')
f2413e67 2146 video_format.add_option('--max-quality',
460d8acb 2147 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2962317d
RG
2148 video_format.add_option('-b', '--best-quality',
2149 action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
7b7759f5 2150 parser.add_option_group(video_format)
2151
2152 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2153 verbosity.add_option('-q', '--quiet',
2154 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2155 verbosity.add_option('-s', '--simulate',
2156 action='store_true', dest='simulate', help='do not download video', default=False)
2157 verbosity.add_option('-g', '--get-url',
2158 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2159 verbosity.add_option('-e', '--get-title',
2160 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
7e58d568
RG
2161 verbosity.add_option('--get-thumbnail',
2162 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2163 verbosity.add_option('--get-description',
2164 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
d9835247
RG
2165 verbosity.add_option('--no-progress',
2166 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
7b7759f5 2167 parser.add_option_group(verbosity)
2168
2169 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
1c76e23e
RG
2170 filesystem.add_option('-t', '--title',
2171 action='store_true', dest='usetitle', help='use title in file name', default=False)
2172 filesystem.add_option('-l', '--literal',
2173 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
1e47d226
NA
2174 filesystem.add_option('-A', '--auto-number',
2175 action='store_true', dest='autonumber', help='number downloaded URLs starting from 00000', default=False)
7b7759f5 2176 filesystem.add_option('-o', '--output',
2b06c33d 2177 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
7b7759f5 2178 filesystem.add_option('-a', '--batch-file',
2b06c33d 2179 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
7b7759f5 2180 filesystem.add_option('-w', '--no-overwrites',
0beeff4b 2181 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
f76c2df6
PI
2182 filesystem.add_option('-c', '--continue',
2183 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
80066952
RG
2184 filesystem.add_option('--cookies',
2185 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
7b7759f5 2186 parser.add_option_group(filesystem)
2187
209e9e27 2188 (opts, args) = parser.parse_args()
2a7353b8 2189
80066952
RG
2190 # Open appropriate CookieJar
2191 if opts.cookiefile is None:
2192 jar = cookielib.CookieJar()
2193 else:
2194 try:
2195 jar = cookielib.MozillaCookieJar(opts.cookiefile)
e0c982c8
RG
2196 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2197 jar.load()
80066952
RG
2198 except (IOError, OSError), err:
2199 sys.exit(u'ERROR: unable to open cookie file')
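	# Note: MozillaCookieJar above reads and writes the Netscape cookies.txt format,
	# so the --cookies file can be shared with browser exports in that format.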
2200
2201 # General configuration
2202 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2203 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor))
2205 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2206
c6fd0bb8 2207 # Batch file verification
d1580ed9 2208 batchurls = []
c6fd0bb8
RG
2209 if opts.batchfile is not None:
2210 try:
2a7353b8
RG
2211 if opts.batchfile == '-':
2212 batchfd = sys.stdin
2213 else:
2214 batchfd = open(opts.batchfile, 'r')
2215 batchurls = batchfd.readlines()
b65740e4 2216 batchurls = [x.strip() for x in batchurls]
817e8f52 2217 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
c6fd0bb8
RG
2218 except IOError:
2219 sys.exit(u'ERROR: batch file could not be read')
2220 all_urls = batchurls + args
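	# For reference, a batch file (read above) is simply one URL per line; blank lines
	# and lines starting with '#', '/' or ';' are skipped, e.g. (hypothetical file):
	#   # my download queue
	#   http://www.youtube.com/watch?v=abcdefghijk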
2221
209e9e27 2222 # Conflicting, missing and erroneous options
2962317d
RG
2223 if opts.bestquality:
2224 print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
209e9e27 2225 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2740c509 2226 parser.error(u'using .netrc conflicts with giving username/password')
209e9e27 2227 if opts.password is not None and opts.username is None:
2740c509 2228 parser.error(u'account username missing')
1e47d226
NA
2229 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2230 parser.error(u'using output template conflicts with using title, literal title or auto number')
209e9e27 2231 if opts.usetitle and opts.useliteral:
2740c509 2232 parser.error(u'using title conflicts with using literal title')
209e9e27 2233 if opts.username is not None and opts.password is None:
76a7f364 2234 opts.password = getpass.getpass(u'Type account password and press return:')
acd3d842
RG
2235 if opts.ratelimit is not None:
2236 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2237 if numeric_limit is None:
2740c509 2238 parser.error(u'invalid rate limit specified')
acd3d842 2239 opts.ratelimit = numeric_limit
7031008c
RG
2240 if opts.retries is not None:
2241 try:
2242 opts.retries = long(opts.retries)
2243 except (TypeError, ValueError), err:
2244 parser.error(u'invalid retry count specified')
8cc44341
RG
2245 try:
2246 opts.playliststart = long(opts.playliststart)
2247 if opts.playliststart <= 0:
2248 raise ValueError
2249 except (TypeError, ValueError), err:
2250 parser.error(u'invalid playlist start number specified')
2251 try:
2252 opts.playlistend = long(opts.playlistend)
2253 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2254 raise ValueError
2255 except (TypeError, ValueError), err:
2256 parser.error(u'invalid playlist end number specified')
4fa74b52
RG
2257
2258 # Information extractors
2259 youtube_ie = YoutubeIE()
020f7150 2260 metacafe_ie = MetacafeIE(youtube_ie)
4135fa45 2261 dailymotion_ie = DailymotionIE()
0c2dc87d 2262 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
c39c05cd 2263 youtube_user_ie = YoutubeUserIE(youtube_ie)
25af2bce 2264 youtube_search_ie = YoutubeSearchIE(youtube_ie)
49c0028a 2265 google_ie = GoogleIE()
7e58d568 2266 google_search_ie = GoogleSearchIE(google_ie)
49c0028a 2267 photobucket_ie = PhotobucketIE()
61945318 2268 yahoo_ie = YahooIE()
7e58d568 2269 yahoo_search_ie = YahooSearchIE(yahoo_ie)
490fd7ae 2270 generic_ie = GenericIE()
4fa74b52
RG
2271
2272 # File downloader
9fcd8355 2273 fd = FileDownloader({
209e9e27
RG
2274 'usenetrc': opts.usenetrc,
2275 'username': opts.username,
2276 'password': opts.password,
7e58d568 2277 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
209e9e27
RG
2278 'forceurl': opts.geturl,
2279 'forcetitle': opts.gettitle,
7e58d568
RG
2280 'forcethumbnail': opts.getthumbnail,
2281 'forcedescription': opts.getdescription,
2282 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
320becd6 2283 'format': opts.format,
f2413e67 2284 'format_limit': opts.format_limit,
eae2666c 2285 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
6ba562b0
RG
2286 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2287 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2288 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
1e47d226
NA
2289 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2290 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
76a7f364
RG
2291 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2292 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
1e47d226 2293 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
76a7f364 2294 or u'%(id)s.%(ext)s'),
0086d1ec 2295 'ignoreerrors': opts.ignoreerrors,
acd3d842 2296 'ratelimit': opts.ratelimit,
0beeff4b 2297 'nooverwrites': opts.nooverwrites,
7031008c 2298 'retries': opts.retries,
7db85b2c 2299 'continuedl': opts.continue_dl,
d9835247 2300 'noprogress': opts.noprogress,
204c9398 2301 'playliststart': opts.playliststart,
8cc44341 2302 'playlistend': opts.playlistend,
331ce0a0 2303 'logtostderr': opts.outtmpl == '-',
9fcd8355 2304 })
25af2bce 2305 fd.add_info_extractor(youtube_search_ie)
0c2dc87d 2306 fd.add_info_extractor(youtube_pl_ie)
c39c05cd 2307 fd.add_info_extractor(youtube_user_ie)
020f7150 2308 fd.add_info_extractor(metacafe_ie)
4135fa45 2309 fd.add_info_extractor(dailymotion_ie)
4fa74b52 2310 fd.add_info_extractor(youtube_ie)
49c0028a 2311 fd.add_info_extractor(google_ie)
7e58d568 2312 fd.add_info_extractor(google_search_ie)
49c0028a 2313 fd.add_info_extractor(photobucket_ie)
61945318 2314 fd.add_info_extractor(yahoo_ie)
7e58d568 2315 fd.add_info_extractor(yahoo_search_ie)
4bec29ef 2316
490fd7ae
RG
2317 # This must come last since it's the
2318 # fallback if none of the others work
2319 fd.add_info_extractor(generic_ie)
2320
4bec29ef
RG
2321 # Update version
2322 if opts.update_self:
2323 update_self(fd, sys.argv[0])
2324
2325 # Maybe do nothing
2326 if len(all_urls) < 1:
2327 if not opts.update_self:
2328 parser.error(u'you must provide at least one URL')
2329 else:
2330 sys.exit()
c6fd0bb8 2331 retcode = fd.download(all_urls)
80066952
RG
2332
2333 # Dump cookie jar if requested
2334 if opts.cookiefile is not None:
2335 try:
2336 jar.save()
2337 except (IOError, OSError), err:
2338 sys.exit(u'ERROR: unable to save cookie jar')
2339
bb681b88 2340 sys.exit(retcode)
4fa74b52 2341
e5bf0f55
RG
2342 except DownloadError:
2343 sys.exit(1)
2344 except SameFileError:
76a7f364 2345 sys.exit(u'ERROR: fixed output name but more than one file to download')
4fa74b52 2346 except KeyboardInterrupt:
76a7f364 2347 sys.exit(u'\nERROR: Interrupted by user')