#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Ricardo Garcia Gonzalez
# Author: Danny Colligan
# Author: Benjamin Johnson
# License: Public domain code
import htmlentitydefs
import httplib
import locale
import math
import netrc
import os
import os.path
import re
import socket
import string
import subprocess
import sys
import time
import urllib
import urllib2

# parse_qs was moved from the cgi module to the urlparse module recently.
try:
    from urlparse import parse_qs
except ImportError:
    from cgi import parse_qs

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.8) Gecko/20100723 Firefox/3.6.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
}

simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')

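# Note: the generator below only wraps the try/except; the net effect is to
# return locale.getpreferredencoding() when it names a usable codec and to
# fall back to UTF-8 otherwise.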
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    def yield_preferredencoding():
        try:
            pref = locale.getpreferredencoding()
            u'TEST'.encode(pref)
        except:
            pref = 'UTF-8'
        while True:
            yield pref
    return yield_preferredencoding().next()

def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Unicode character
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)

def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename."""
    utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
    return utitle.replace(unicode(os.sep), u'%')

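# For example, on a POSIX system sanitize_title(u'A &amp; B / C') yields
# u'A & B % C': the HTML entity is decoded and the path separator is replaced.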
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            return (sys.stdout, filename)
        stream = open(filename, open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass

class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass

class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass

class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass

class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected

class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. Since, given a video URL, the downloader doesn't know how to
    extract all the needed information (a task the InfoExtractors perform),
    it has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:       Username for authentication purposes.
    password:       Password for authentication purposes.
    usenetrc:       Use netrc for authentication instead.
    quiet:          Do not print messages to stdout.
    forceurl:       Force printing final URL.
    forcetitle:     Force printing title.
    simulate:       Do not download the video files.
    format:         Video format code.
    format_limit:   Highest quality format to try.
    outtmpl:        Template for output names.
    ignoreerrors:   Do not stop on download errors.
    ratelimit:      Download speed limit, in bytes/sec.
    nooverwrites:   Prevent overwriting files.
    retries:        Number of times to retry for HTTP error 5xx.
    continuedl:     Try to continue downloads if possible.
    noprogress:     Do not print the progress bar.
    """

    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._ies = []
        self._pps = []
        self._download_retcode = 0
        self._num_downloads = 0
        self.params = params

    @staticmethod
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        components = filename.split(os.sep)
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):
                os.mkdir(dir)

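    # For example, with a POSIX path pmkdir('downloads/music/video.flv') creates
    # 'downloads/' and 'downloads/music/' if they do not exist; the final path
    # component is treated as the file name and is not created.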
    @staticmethod
    def format_bytes(bytes):
        if bytes is None:
            return 'N/A'
        if type(bytes) is str:
            bytes = float(bytes)
        if bytes == 0.0:
            exponent = 0
        else:
            exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)

    @staticmethod
    def calc_percent(byte_counter, data_len):
        if data_len is None:
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001: # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        dif = now - start
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return long(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return long(new_max)
        if rate < new_min:
            return long(new_min)
        return long(rate)

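    # In other words, the next block size follows the measured throughput of
    # the previous block, but is never allowed to shrink below half or grow
    # beyond double the previous size (capped at 4 MB).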
    @staticmethod
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        if matchobj is None:
            return None
        number = float(matchobj.group(1))
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        pp.set_downloader(self)

    def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        try:
            if not self.params.get('quiet', False):
                print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
            sys.stdout.flush()
        except (UnicodeEncodeError), err:
            if not ignore_encoding_errors:
                raise

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message.encode(preferredencoding())

    def fixed_template(self):
        """Checks if the output template is fixed."""
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on whether the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            return
        now = time.time()
        elapsed = now - start_time
        if elapsed <= 0.0:
            return
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

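    # The sleep time above is the difference between how long the transfer
    # should have taken at the rate limit (byte_counter / rate_limit) and how
    # long it actually took (now - start_time).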
    def report_destination(self, filename):
        """Report destination filename."""
        self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            return
        self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx."""
        self.to_stdout(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_stdout(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            self.to_stdout(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_stdout(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_stdout(u'[download] Download completed')
        else:
            self.to_stdout(u'')

    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Forced printings
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

            return

        try:
            template_dict = dict(info_dict)
            template_dict['epoch'] = unicode(long(time.time()))
            template_dict['ord'] = unicode('%05d' % self._num_downloads)
            filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            return
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')
            return

        try:
            self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directories: %s' % str(err))
            return

        try:
            success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
            return
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
            return

        if success:
            try:
                self.post_process(filename, info_dict)
            except (PostProcessingError), err:
                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                return

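    # The output template used above is expanded with the fields of the info
    # dictionary plus the 'epoch' and 'ord' fields added in process_info; for
    # example an outtmpl of u'%(stitle)s-%(id)s.%(ext)s' produces names such
    # as 'my_video-abc123.flv' (illustrative values).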
    def download(self, url_list):
        """Download a given list of URLs."""
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):
                    continue

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it
                ie.extract(url)

                # Suitable InfoExtractor had been found; go to next URL
                break

            if not suitable_found:
                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        info = dict(ie_info)
        info['filepath'] = filename
        for pp in self._pps:
            info = pp.run(info)
            if info is None:
                break

    def _download_with_rtmpdump(self, filename, url, player_url):
        self.report_destination(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrupted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(filename)
            self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(filename)
            if prevsize == cursize and retval == 1:
                break
        if retval == 0:
            self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
            return True
        else:
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            return False

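    # Note: the [[], [...]][condition] construction used above is a Python 2
    # idiom standing in for a conditional expression: it appends the extra
    # rtmpdump arguments only when the condition is true.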
    def _do_download(self, filename, url, player_url):
        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        stream = None
        open_mode = 'wb'
        basic_request = urllib2.Request(url, None, std_headers)
        request = urllib2.Request(url, None, std_headers)

        # Establish possible resume length
        if os.path.isfile(filename):
            resume_len = os.path.getsize(filename)
        else:
            resume_len = 0

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range', 'bytes=%d-' % resume_len)
            open_mode = 'ab'

        count = 0
        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
            try:
                data = urllib2.urlopen(request)
                break
            except (urllib2.HTTPError, ), err:
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                    raise
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    try:
                        # Open the connection again without the range header
                        data = urllib2.urlopen(basic_request)
                        content_length = data.info()['Content-Length']
                    except (urllib2.HTTPError, ), err:
                        if err.code < 500 or err.code >= 600:
                            raise
                    else:
                        # Examine the reported length
                        if (content_length is not None and
                            (resume_len - 100 < long(content_length) < resume_len + 100)):
                            # The file had already been fully downloaded.
                            # Explanation to the above condition: in issue #175 it was revealed that
                            # YouTube sometimes adds or removes a few bytes from the end of the file,
                            # changing the file size slightly and causing problems for some users. So
                            # I decided to implement a suggested change and consider the file
                            # completely downloaded if the file size differs less than 100 bytes from
                            # the one in the hard drive.
                            self.report_file_already_downloaded(filename)
                            return True
                        else:
                            # The length does not match, we start the download over
                            self.report_unable_to_resume()
                            open_mode = 'wb'
                            break
            # Retry
            count += 1
            if count <= retries:
                self.report_retry(count, retries)

        if count > retries:
            self.trouble(u'ERROR: giving up after %s retries' % retries)
            return False

        data_len = data.info().get('Content-length', None)
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0
        block_size = 1024
        start = time.time()
        while True:
            # Download and write
            before = time.time()
            data_block = data.read(block_size)
            after = time.time()
            data_block_len = len(data_block)
            if data_block_len == 0:
                break
            byte_counter += data_block_len

            # Open file just in time
            if stream is None:
                try:
                    (stream, filename) = sanitize_open(filename, open_mode)
                    self.report_destination(filename)
                except (OSError, IOError), err:
                    self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
                    return False
            try:
                stream.write(data_block)
            except (IOError, OSError), err:
                self.trouble(u'\nERROR: unable to write data: %s' % str(err))
                return False
            block_size = self.best_block_size(after - before, data_block_len)

            # Progress message
            percent_str = self.calc_percent(byte_counter, data_len)
            eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
            speed_str = self.calc_speed(start, time.time(), byte_counter)
            self.report_progress(percent_str, data_len_str, speed_str, eta_str)

            # Apply rate limit
            self.slow_down(start, byte_counter)

        self.report_finish()
        if data_len is not None and str(byte_counter) != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        return True

class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader.
    title:          Literal title.
    stitle:         Simplified title.
    ext:            Video filename extension.
    format:         Video format.
    player_url:     SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Receives a URL and returns True if suitable for this IE."""
        return False

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

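# A minimal subclass sketch (illustrative only; ExampleIE and its URL are
# hypothetical, and the real extractors below also report progress through
# self._downloader and handle errors via trouble()):
#
#   class ExampleIE(InfoExtractor):
#       _VALID_URL = r'(?:http://)?(?:www\.)?example\.com/video/(\w+)'
#
#       @staticmethod
#       def suitable(url):
#           return re.match(ExampleIE._VALID_URL, url) is not None
#
#       def _real_extract(self, url):
#           video_id = re.match(self._VALID_URL, url).group(1)
#           self._downloader.increment_downloads()
#           self._downloader.process_info({'id': video_id, ...})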
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r'^((?:http://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
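    # The pattern above accepts plain video IDs as well as the common URL
    # shapes, e.g. 'http://www.youtube.com/watch?v=ID', 'http://youtu.be/ID'
    # and 'http://www.youtube.com/v/ID' (illustrative examples).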
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '45': 'webm',
    }

    @staticmethod
    def suitable(url):
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_stdout(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_stdout(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_stdout(u'[youtube] RTMP download detected')

    def _real_initialize(self):
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language
        request = urllib2.Request(self._LANG_URL, None, std_headers)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next':         '/',
            'action_login': 'Log In',
            'username':     username,
            'password':     password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url':       '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*"(http://.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = mobj.group(1)
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url, None, std_headers)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # description
        video_description = 'No description available.'
        if self._downloader.params.get('forcedescription', False):
            mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
            if mobj is not None:
                video_description = mobj.group(1)

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        requested_format = self._downloader.params.get('format', None)
        get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)

        if 'fmt_url_map' in video_info:
            url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if requested_format is None:
                video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
            elif requested_format == '-1':
                video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
            else:
                video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format

        elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]

        else:
            self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
            return

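        # At this point video_url_list holds (format, url) pairs: the single
        # best available format by default, every available format when the
        # requested format is '-1', or just the explicitly requested format.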
        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Find the video URL in fmt_url_map or conn parameters
            try:
                # Process video information
                self._downloader.process_info({
                    'id':           video_id.decode('utf-8'),
                    'url':          video_real_url.decode('utf-8'),
                    'uploader':     video_uploader.decode('utf-8'),
                    'title':        video_title,
                    'stitle':       simple_title,
                    'ext':          video_extension.decode('utf-8'),
                    'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail':    video_thumbnail.decode('utf-8'),
                    'description':  video_description.decode('utf-8'),
                    'player_url':   player_url,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')


class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    _youtube_ie = None

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER, None, std_headers)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
            return

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id':           video_id.decode('utf-8'),
                'url':          video_url.decode('utf-8'),
                'uploader':     video_uploader.decode('utf-8'),
                'title':        video_title,
                'stitle':       simple_title,
                'ext':          video_extension.decode('utf-8'),
                'format':       u'NA',
                'player_url':   None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')


class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        return (re.match(DailymotionIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_initialize(self):
        return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
        mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id':           video_id.decode('utf-8'),
                'url':          video_url.decode('utf-8'),
                'uploader':     video_uploader.decode('utf-8'),
                'title':        video_title,
                'stitle':       simple_title,
                'ext':          video_extension.decode('utf-8'),
                'format':       u'NA',
                'player_url':   None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')

class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        return (re.match(GoogleIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):
        return

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        try:
            # Process video information
            self._downloader.process_info({
                'id':           video_id.decode('utf-8'),
                'url':          video_url.decode('utf-8'),
                'uploader':     u'NA',
                'title':        video_title,
                'stitle':       simple_title,
                'ext':          video_extension.decode('utf-8'),
                'format':       u'NA',
                'player_url':   None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')


class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        return (re.match(PhotobucketIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)

    def _real_initialize(self):
        return

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        video_uploader = mobj.group(2).decode('utf-8')

        try:
            # Process video information
            self._downloader.process_info({
                'id':           video_id.decode('utf-8'),
                'url':          video_url.decode('utf-8'),
                'uploader':     video_uploader,
                'title':        video_title,
                'stitle':       simple_title,
                'ext':          video_extension.decode('utf-8'),
                'format':       u'NA',
                'player_url':   None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')


61945318
RG
1373class YahooIE(InfoExtractor):
1374 """Information extractor for video.yahoo.com."""
1375
1376 # _VALID_URL matches all Yahoo! Video URLs
1377 # _VPAGE_URL matches only the extractable '/watch/' URLs
1378 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1379 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1380
1381 def __init__(self, downloader=None):
1382 InfoExtractor.__init__(self, downloader)
1383
1384 @staticmethod
1385 def suitable(url):
1386 return (re.match(YahooIE._VALID_URL, url) is not None)
1387
1388 def report_download_webpage(self, video_id):
1389 """Report webpage download."""
1390 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1391
1392 def report_extraction(self, video_id):
1393 """Report information extraction."""
1394 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1395
1396 def _real_initialize(self):
1397 return
1398
df372a65 1399 def _real_extract(self, url, new_video=True):
61945318
RG
1400 # Extract ID from URL
1401 mobj = re.match(self._VALID_URL, url)
1402 if mobj is None:
1403 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1404 return
1405
df372a65 1406 # At this point we have a new video
9bf7fa52 1407 self._downloader.increment_downloads()
61945318
RG
1408 video_id = mobj.group(2)
1409 video_extension = 'flv'
1410
1411 # Rewrite valid but non-extractable URLs as
1412 # extractable English language /watch/ URLs
1413 if re.match(self._VPAGE_URL, url) is None:
1414 request = urllib2.Request(url)
1415 try:
1416 webpage = urllib2.urlopen(request).read()
1417 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1418 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1419 return
1420
1421 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1422 if mobj is None:
1423 self._downloader.trouble(u'ERROR: Unable to extract id field')
1424 return
1425 yahoo_id = mobj.group(1)
1426
1427 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1428 if mobj is None:
1429 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1430 return
1431 yahoo_vid = mobj.group(1)
1432
1433 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
df372a65 1434 return self._real_extract(url, new_video=False)
61945318
RG
1435
1436 # Retrieve video webpage to extract further information
1437 request = urllib2.Request(url)
1438 try:
1439 self.report_download_webpage(video_id)
1440 webpage = urllib2.urlopen(request).read()
1441 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1442 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1443 return
1444
1445 # Extract uploader and title from webpage
1446 self.report_extraction(video_id)
1447 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1448 if mobj is None:
1449 self._downloader.trouble(u'ERROR: unable to extract video title')
1450 return
1451 video_title = mobj.group(1).decode('utf-8')
1452 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1453
1454 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1455 if mobj is None:
1456 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1457 return
1458 video_uploader = mobj.group(2).decode('utf-8') # group(2) is the uploader name; group(1) only matched 'people' or 'profile'
1459
7e58d568
RG
1460 # Extract video thumbnail
1461 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1462 if mobj is None:
1463 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1464 return
1465 video_thumbnail = mobj.group(1).decode('utf-8')
1466
1467 # Extract video description
1468 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1469 if mobj is None:
1470 self._downloader.trouble(u'ERROR: unable to extract video description')
1471 return
1472 video_description = mobj.group(1).decode('utf-8')
1473 if not video_description: video_description = 'No description available.'
1474
61945318
RG
1475 # Extract video height and width
1476 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1477 if mobj is None:
1478 self._downloader.trouble(u'ERROR: unable to extract video height')
1479 return
1480 yv_video_height = mobj.group(1)
1481
1482 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1483 if mobj is None:
1484 self._downloader.trouble(u'ERROR: unable to extract video width')
1485 return
1486 yv_video_width = mobj.group(1)
1487
1488 # Retrieve video playlist to extract media URL
1489 # I'm not completely sure what all these options are, but we
1490 # seem to need most of them, otherwise the server sends a 401.
1491 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1492 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1493 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1494 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1495 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1496 try:
1497 self.report_download_webpage(video_id)
1498 webpage = urllib2.urlopen(request).read()
1499 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1500 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1501 return
1502
1503 # Extract media URL from playlist XML
1504 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1505 if mobj is None:
1506 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1507 return
1508 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1509 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1510
1511 try:
1512 # Process video information
1513 self._downloader.process_info({
1514 'id': video_id.decode('utf-8'),
1515 'url': video_url,
1516 'uploader': video_uploader,
1517 'title': video_title,
1518 'stitle': simple_title,
1519 'ext': video_extension.decode('utf-8'),
7e58d568
RG
1520 'thumbnail': video_thumbnail, # already unicode; decoding again would be wrong
1521 'description': video_description,
e616ec0c 1524 'player_url': None,
61945318 1525 })
73f4e7af
RG
1526 except UnavailableVideoError:
1527 self._downloader.trouble(u'ERROR: unable to download video')
61945318
RG
1528
1529
490fd7ae
RG
1530class GenericIE(InfoExtractor):
1531 """Generic last-resort information extractor."""
1532
1533 def __init__(self, downloader=None):
1534 InfoExtractor.__init__(self, downloader)
1535
1536 @staticmethod
1537 def suitable(url):
1538 return True
1539
1540 def report_download_webpage(self, video_id):
1541 """Report webpage download."""
1542 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1543 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1544
1545 def report_extraction(self, video_id):
1546 """Report information extraction."""
1547 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1548
1549 def _real_initialize(self):
1550 return
1551
1552 def _real_extract(self, url):
df372a65 1553 # At this point we have a new video
9bf7fa52 1554 self._downloader.increment_downloads()
df372a65 1555
490fd7ae
RG
1556 video_id = url.split('/')[-1]
1557 request = urllib2.Request(url)
1558 try:
1559 self.report_download_webpage(video_id)
1560 webpage = urllib2.urlopen(request).read()
1561 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1562 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1563 return
1564 except ValueError, err:
1565 # since this is the last-resort InfoExtractor, if
1566 # this error is thrown, it'll be thrown here
1567 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1568 return
1569
1570 # Start with something easy: JW Player in SWFObject
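# An illustrative page fragment this pattern would match (made-up URL):
#   flashvars: "file=http://cdn.example.com/videos/clip.flv&image=thumb.jpg"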
1571 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1572 if mobj is None:
1573 # Broaden the search a little bit
1574 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1575 if mobj is None:
1576 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1577 return
1578
1579 # It's possible that one of the regexes
1580 # matched, but returned an empty group:
1581 if mobj.group(1) is None:
1582 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1583 return
1584
1585 video_url = urllib.unquote(mobj.group(1))
1586 video_id = os.path.basename(video_url)
1587
1588 # here's a fun little line of code for you:
1589 video_extension = os.path.splitext(video_id)[1][1:]
1590 video_id = os.path.splitext(video_id)[0]
1591
1592 # it's tempting to parse this further, but you would
1593 # have to take into account all the variations like
1594 # Video Title - Site Name
1595 # Site Name | Video Title
1596 # Video Title - Tagline | Site Name
1597 # and so on and so forth; it's just not practical
1598 mobj = re.search(r'<title>(.*)</title>', webpage)
1599 if mobj is None:
1600 self._downloader.trouble(u'ERROR: unable to extract title')
1601 return
1602 video_title = mobj.group(1).decode('utf-8')
1603 video_title = sanitize_title(video_title)
31cbdaaf 1604 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
490fd7ae
RG
1605
1606 # video uploader is domain name
1607 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1608 if mobj is None:
1609 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1610 return
1611 video_uploader = mobj.group(1).decode('utf-8')
1612
1613 try:
1614 # Process video information
1615 self._downloader.process_info({
1616 'id': video_id.decode('utf-8'),
1617 'url': video_url.decode('utf-8'),
1618 'uploader': video_uploader,
1619 'title': video_title,
31cbdaaf 1620 'stitle': simple_title,
49c0028a 1621 'ext': video_extension.decode('utf-8'),
6ba562b0 1622 'format': u'NA',
e616ec0c 1623 'player_url': None,
49c0028a 1624 })
73f4e7af
RG
1625 except UnavailableVideoError, err:
1626 self._downloader.trouble(u'ERROR: unable to download video')
49c0028a 1627
1628
25af2bce
RG
1629class YoutubeSearchIE(InfoExtractor):
1630 """Information Extractor for YouTube search queries."""
1631 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1632 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1633 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
304a4d85 1634 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
25af2bce 1635 _youtube_ie = None
fd9288c3 1636 _max_youtube_results = 1000
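# Query syntax handled in _real_extract below (queries are illustrative):
#   ytsearch:python tutorial     -> download the first result only
#   ytsearch15:python tutorial   -> download the first 15 results
#   ytsearchall:python tutorial  -> download up to _max_youtube_results results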
25af2bce 1637
f995f712 1638 def __init__(self, youtube_ie, downloader=None):
25af2bce
RG
1639 InfoExtractor.__init__(self, downloader)
1640 self._youtube_ie = youtube_ie
1641
1642 @staticmethod
1643 def suitable(url):
1644 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1645
1646 def report_download_page(self, query, pagenum):
1647 """Report attempt to download playlist page with given number."""
490fd7ae 1648 query = query.decode(preferredencoding())
3aaf887e 1649 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
25af2bce
RG
1650
1651 def _real_initialize(self):
1652 self._youtube_ie.initialize()
1653
1654 def _real_extract(self, query):
1655 mobj = re.match(self._VALID_QUERY, query)
1656 if mobj is None:
147753eb 1657 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
6f21f686 1658 return
25af2bce
RG
1659
1660 prefix, query = query.split(':')
1661 prefix = prefix[8:]
490fd7ae 1662 query = query.encode('utf-8')
f995f712 1663 if prefix == '':
6f21f686
RG
1664 self._download_n_results(query, 1)
1665 return
f995f712 1666 elif prefix == 'all':
6f21f686
RG
1667 self._download_n_results(query, self._max_youtube_results)
1668 return
f995f712 1669 else:
25af2bce 1670 try:
e1f18b8a 1671 n = long(prefix)
25af2bce 1672 if n <= 0:
147753eb 1673 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
6f21f686 1674 return
257453b9 1675 elif n > self._max_youtube_results:
6f21f686 1676 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
257453b9 1677 n = self._max_youtube_results
6f21f686
RG
1678 self._download_n_results(query, n)
1679 return
e1f18b8a 1680 except ValueError: # parsing prefix as integer fails
6f21f686
RG
1681 self._download_n_results(query, 1)
1682 return
25af2bce
RG
1683
1684 def _download_n_results(self, query, n):
1685 """Downloads a specified number of results for a query"""
1686
1687 video_ids = []
1688 already_seen = set()
1689 pagenum = 1
1690
1691 while True:
1692 self.report_download_page(query, pagenum)
a9633f14 1693 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
25af2bce
RG
1694 request = urllib2.Request(result_url, None, std_headers)
1695 try:
1696 page = urllib2.urlopen(request).read()
1697 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 1698 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
6f21f686 1699 return
25af2bce
RG
1700
1701 # Extract video identifiers
1702 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1703 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1704 if video_id not in already_seen:
1705 video_ids.append(video_id)
1706 already_seen.add(video_id)
1707 if len(video_ids) == n:
1708 # Specified n videos reached
25af2bce 1709 for id in video_ids:
6f21f686
RG
1710 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1711 return
25af2bce 1712
304a4d85 1713 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
25af2bce 1714 for id in video_ids:
6f21f686
RG
1715 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1716 return
25af2bce
RG
1717
1718 pagenum = pagenum + 1
1719
7e58d568
RG
1720class GoogleSearchIE(InfoExtractor):
1721 """Information Extractor for Google Video search queries."""
1722 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1723 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1724 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1725 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1726 _google_ie = None
1727 _max_google_results = 1000
1728
1729 def __init__(self, google_ie, downloader=None):
1730 InfoExtractor.__init__(self, downloader)
1731 self._google_ie = google_ie
1732
1733 @staticmethod
1734 def suitable(url):
1735 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1736
1737 def report_download_page(self, query, pagenum):
1738 """Report attempt to download playlist page with given number."""
1739 query = query.decode(preferredencoding())
1740 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1741
1742 def _real_initialize(self):
1743 self._google_ie.initialize()
1744
1745 def _real_extract(self, query):
1746 mobj = re.match(self._VALID_QUERY, query)
1747 if mobj is None:
1748 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1749 return
1750
1751 prefix, query = query.split(':')
1752 prefix = prefix[8:]
1753 query = query.encode('utf-8')
1754 if prefix == '':
1755 self._download_n_results(query, 1)
1756 return
1757 elif prefix == 'all':
1758 self._download_n_results(query, self._max_google_results)
1759 return
1760 else:
1761 try:
1762 n = long(prefix)
1763 if n <= 0:
1764 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1765 return
1766 elif n > self._max_google_results:
1767 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1768 n = self._max_google_results
1769 self._download_n_results(query, n)
1770 return
1771 except ValueError: # parsing prefix as integer fails
1772 self._download_n_results(query, 1)
1773 return
1774
1775 def _download_n_results(self, query, n):
1776 """Downloads a specified number of results for a query"""
1777
1778 video_ids = []
1779 already_seen = set()
1780 pagenum = 1
1781
1782 while True:
1783 self.report_download_page(query, pagenum)
1784 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1785 request = urllib2.Request(result_url, None, std_headers)
1786 try:
1787 page = urllib2.urlopen(request).read()
1788 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1789 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1790 return
1791
1792 # Extract video identifiers
1793 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1794 video_id = mobj.group(1)
1795 if video_id not in already_seen:
1796 video_ids.append(video_id)
1797 already_seen.add(video_id)
1798 if len(video_ids) == n:
1799 # Specified n videos reached
1800 for id in video_ids:
1801 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1802 return
1803
1804 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1805 for id in video_ids:
1806 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1807 return
1808
1809 pagenum = pagenum + 1
1810
1811class YahooSearchIE(InfoExtractor):
1812 """Information Extractor for Yahoo! Video search queries."""
1813 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1814 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1815 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1816 _MORE_PAGES_INDICATOR = r'\s*Next'
1817 _yahoo_ie = None
1818 _max_yahoo_results = 1000
1819
1820 def __init__(self, yahoo_ie, downloader=None):
1821 InfoExtractor.__init__(self, downloader)
1822 self._yahoo_ie = yahoo_ie
1823
1824 @staticmethod
1825 def suitable(url):
1826 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1827
1828 def report_download_page(self, query, pagenum):
1829 """Report attempt to download playlist page with given number."""
1830 query = query.decode(preferredencoding())
1831 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1832
1833 def _real_initialize(self):
1834 self._yahoo_ie.initialize()
1835
1836 def _real_extract(self, query):
1837 mobj = re.match(self._VALID_QUERY, query)
1838 if mobj is None:
1839 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1840 return
1841
1842 prefix, query = query.split(':')
1843 prefix = prefix[8:]
1844 query = query.encode('utf-8')
1845 if prefix == '':
1846 self._download_n_results(query, 1)
1847 return
1848 elif prefix == 'all':
1849 self._download_n_results(query, self._max_yahoo_results)
1850 return
1851 else:
1852 try:
1853 n = long(prefix)
1854 if n <= 0:
1855 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1856 return
1857 elif n > self._max_yahoo_results:
1858 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1859 n = self._max_yahoo_results
1860 self._download_n_results(query, n)
1861 return
1862 except ValueError: # parsing prefix as integer fails
1863 self._download_n_results(query, 1)
1864 return
1865
1866 def _download_n_results(self, query, n):
1867 """Downloads a specified number of results for a query"""
1868
1869 video_ids = []
1870 already_seen = set()
1871 pagenum = 1
1872
1873 while True:
1874 self.report_download_page(query, pagenum)
1875 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1876 request = urllib2.Request(result_url, None, std_headers)
1877 try:
1878 page = urllib2.urlopen(request).read()
1879 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1880 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1881 return
1882
1883 # Extract video identifiers
1884 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1885 video_id = mobj.group(1)
1886 if video_id not in already_seen:
1887 video_ids.append(video_id)
1888 already_seen.add(video_id)
1889 if len(video_ids) == n:
1890 # Specified n videos reached
1891 for id in video_ids:
1892 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1893 return
1894
1895 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1896 for id in video_ids:
1897 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1898 return
1899
1900 pagenum = pagenum + 1
1901
0c2dc87d
RG
1902class YoutubePlaylistIE(InfoExtractor):
1903 """Information Extractor for YouTube playlists."""
1904
9177ce4d 1905 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
72ac78b8 1906 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
0c2dc87d 1907 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
ce5cafea 1908 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
0c2dc87d
RG
1909 _youtube_ie = None
1910
1911 def __init__(self, youtube_ie, downloader=None):
1912 InfoExtractor.__init__(self, downloader)
1913 self._youtube_ie = youtube_ie
1914
1915 @staticmethod
1916 def suitable(url):
1917 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1918
1919 def report_download_page(self, playlist_id, pagenum):
1920 """Report attempt to download playlist page with given number."""
3aaf887e 1921 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
0c2dc87d
RG
1922
1923 def _real_initialize(self):
1924 self._youtube_ie.initialize()
1925
1926 def _real_extract(self, url):
1927 # Extract playlist id
1928 mobj = re.match(self._VALID_URL, url)
1929 if mobj is None:
147753eb 1930 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
6f21f686 1931 return
0c2dc87d
RG
1932
1933 # Download playlist pages
1934 playlist_id = mobj.group(1)
1935 video_ids = []
1936 pagenum = 1
1937
1938 while True:
1939 self.report_download_page(playlist_id, pagenum)
1940 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1941 try:
1942 page = urllib2.urlopen(request).read()
1943 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 1944 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
6f21f686 1945 return
0c2dc87d
RG
1946
1947 # Extract video identifiers
27d98b6e 1948 ids_in_page = []
0c2dc87d 1949 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
27d98b6e
RG
1950 if mobj.group(1) not in ids_in_page:
1951 ids_in_page.append(mobj.group(1))
1952 video_ids.extend(ids_in_page)
0c2dc87d 1953
ce5cafea 1954 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
0c2dc87d
RG
1955 break
1956 pagenum = pagenum + 1
1957
204c9398
RG
1958 playliststart = self._downloader.params.get('playliststart', 1)
1959 playliststart -= 1 #our arrays are zero-based but the playlist is 1-based
1960 if playliststart > 0:
1961 video_ids = video_ids[playliststart:]
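# e.g. --playlist-start 3 arrives here as playliststart == 2, so the first
# two entries of the playlist are skipped (illustrative value)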
1962
0c2dc87d 1963 for id in video_ids:
6f21f686
RG
1964 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1965 return
0c2dc87d 1966
c39c05cd
A
1967class YoutubeUserIE(InfoExtractor):
1968 """Information Extractor for YouTube users."""
1969
1970 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1971 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
75a4cf3c 1972 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
c39c05cd
A
1973 _youtube_ie = None
1974
1975 def __init__(self, youtube_ie, downloader=None):
1976 InfoExtractor.__init__(self, downloader)
1977 self._youtube_ie = youtube_ie
1978
1979 @staticmethod
1980 def suitable(url):
1981 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1982
1983 def report_download_page(self, username):
1984 """Report attempt to download user page."""
75a4cf3c 1985 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
c39c05cd
A
1986
1987 def _real_initialize(self):
1988 self._youtube_ie.initialize()
1989
1990 def _real_extract(self, url):
1991 # Extract username
1992 mobj = re.match(self._VALID_URL, url)
1993 if mobj is None:
1994 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1995 return
1996
1997 # Download user page
1998 username = mobj.group(1)
1999 video_ids = []
2000 pagenum = 1
2001
2002 self.report_download_page(username)
2003 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2004 try:
2005 page = urllib2.urlopen(request).read()
2006 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2007 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2008 return
2009
2010 # Extract video identifiers
2011 ids_in_page = []
2012
2013 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
c39c05cd
A
2014 if mobj.group(1) not in ids_in_page:
2015 ids_in_page.append(mobj.group(1))
2016 video_ids.extend(ids_in_page)
2017
204c9398
RG
2018 playliststart = self._downloader.params.get('playliststart', 1)
2019 playliststart = playliststart-1 #our arrays are zero-based but the playlist is 1-based
2020 if playliststart > 0:
2021 video_ids = video_ids[playliststart:]
2022
c39c05cd
A
2023 for id in video_ids:
2024 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2025 return
2026
65cd34c5
RG
2027class PostProcessor(object):
2028 """Post Processor class.
2029
2030 PostProcessor objects can be added to downloaders with their
2031 add_post_processor() method. When the downloader has finished a
2032 successful download, it will take its internal chain of PostProcessors
2033 and start calling the run() method on each one of them, first with
2034 an initial argument and then with the returned value of the previous
2035 PostProcessor.
2036
2037 The chain will be stopped if one of them ever returns None or the end
2038 of the chain is reached.
2039
2040 PostProcessor objects follow a "mutual registration" process similar
2041 to InfoExtractor objects.
2042 """
2043
2044 _downloader = None
2045
2046 def __init__(self, downloader=None):
2047 self._downloader = downloader
2048
65cd34c5
RG
2049 def set_downloader(self, downloader):
2050 """Sets the downloader for this PP."""
2051 self._downloader = downloader
2052
2053 def run(self, information):
2054 """Run the PostProcessor.
2055
2056 The "information" argument is a dictionary like the ones
2f11508a 2057 composed by InfoExtractors. The only difference is that this
65cd34c5
RG
2058 one has an extra field called "filepath" that points to the
2059 downloaded file.
2060
2061 When this method returns None, the postprocessing chain is
2062 stopped. However, this method may return an information
2063 dictionary that will be passed to the next postprocessing
2064 object in the chain. It can be the one it received after
2065 changing some fields.
2066
2067 In addition, this method may raise a PostProcessingError
2068 exception that will be taken into account by the downloader
2069 it was called from.
2070 """
2071 return information # by default, do nothing
2072
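# The class below is a minimal, illustrative sketch (not part of the original
# script): it shows how the PostProcessor interface documented above could be
# subclassed. The class name is made up and nothing in this file registers it.
class EchoFilePathPP(PostProcessor):
	"""Example post processor: report where the file ended up, then continue."""
	def run(self, information):
		if self._downloader is not None:
			self._downloader.to_stdout(u'[postprocess] saved to %s' % information['filepath'])
		return information # returning the dict lets the next PP in the chain run
# It would be attached with the downloader's add_post_processor() method,
# e.g. fd.add_post_processor(EchoFilePathPP()) once 'fd' exists (see below).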
2073### MAIN PROGRAM ###
4fa74b52
RG
2074if __name__ == '__main__':
2075 try:
f9f1e798 2076 # Modules needed only when running the main program
209e9e27 2077 import getpass
f9f1e798
RG
2078 import optparse
2079
4bec29ef
RG
2080 # Function to update the program file with the latest version from bitbucket.org
2081 def update_self(downloader, filename):
2082 # Note: downloader only used for options
2083 if not os.access (filename, os.W_OK):
2084 sys.exit('ERROR: no write permissions on %s' % filename)
2085
2086 downloader.to_stdout('Updating to latest stable version...')
2087 latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
2088 latest_version = urllib.urlopen(latest_url).read().strip()
2089 prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2090 newcontent = urllib.urlopen(prog_url).read()
2091 stream = open(filename, 'w')
2092 stream.write(newcontent)
2093 stream.close()
2094 downloader.to_stdout('Updated to version %s' % latest_version)
2095
4fa74b52
RG
2096 # General configuration
2097 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
2098 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
f9f1e798
RG
2099 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2100
2101 # Parse command line
209e9e27 2102 parser = optparse.OptionParser(
7b7759f5 2103 usage='Usage: %prog [options] url...',
80cc2330 2104 version='2010.08.04',
7b7759f5 2105 conflict_handler='resolve',
2106 )
2107
209e9e27
RG
2108 parser.add_option('-h', '--help',
2109 action='help', help='print this help text and exit')
2110 parser.add_option('-v', '--version',
2111 action='version', help='print program version and exit')
4bec29ef
RG
2112 parser.add_option('-U', '--update',
2113 action='store_true', dest='update_self', help='update this program to latest stable version')
7b7759f5 2114 parser.add_option('-i', '--ignore-errors',
2115 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2116 parser.add_option('-r', '--rate-limit',
2b06c33d 2117 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
7031008c 2118 parser.add_option('-R', '--retries',
2b06c33d 2119 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
204c9398
RG
2120 parser.add_option('--playlist-start',
2121 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
7b7759f5 2122
2123 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2124 authentication.add_option('-u', '--username',
2b06c33d 2125 dest='username', metavar='USERNAME', help='account username')
7b7759f5 2126 authentication.add_option('-p', '--password',
2b06c33d 2127 dest='password', metavar='PASSWORD', help='account password')
7b7759f5 2128 authentication.add_option('-n', '--netrc',
209e9e27 2129 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
7b7759f5 2130 parser.add_option_group(authentication)
2131
2132 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2133 video_format.add_option('-f', '--format',
2b06c33d 2134 action='store', dest='format', metavar='FORMAT', help='video format code')
7b7759f5 2135 video_format.add_option('-m', '--mobile-version',
b74c859d 2136 action='store_const', dest='format', help='alias for -f 17', const='17')
6ba562b0
RG
2137 video_format.add_option('--all-formats',
2138 action='store_const', dest='format', help='download all available video formats', const='-1')
f2413e67 2139 video_format.add_option('--max-quality',
460d8acb 2140 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2962317d
RG
2141 video_format.add_option('-b', '--best-quality',
2142 action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
7b7759f5 2143 parser.add_option_group(video_format)
2144
2145 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2146 verbosity.add_option('-q', '--quiet',
2147 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2148 verbosity.add_option('-s', '--simulate',
2149 action='store_true', dest='simulate', help='do not download video', default=False)
2150 verbosity.add_option('-g', '--get-url',
2151 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2152 verbosity.add_option('-e', '--get-title',
2153 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
7e58d568
RG
2154 verbosity.add_option('--get-thumbnail',
2155 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2156 verbosity.add_option('--get-description',
2157 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
d9835247
RG
2158 verbosity.add_option('--no-progress',
2159 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
7b7759f5 2160 parser.add_option_group(verbosity)
2161
2162 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
1c76e23e
RG
2163 filesystem.add_option('-t', '--title',
2164 action='store_true', dest='usetitle', help='use title in file name', default=False)
2165 filesystem.add_option('-l', '--literal',
2166 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
7b7759f5 2167 filesystem.add_option('-o', '--output',
2b06c33d 2168 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
7b7759f5 2169 filesystem.add_option('-a', '--batch-file',
2b06c33d 2170 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
7b7759f5 2171 filesystem.add_option('-w', '--no-overwrites',
0beeff4b 2172 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
f76c2df6
PI
2173 filesystem.add_option('-c', '--continue',
2174 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
7b7759f5 2175 parser.add_option_group(filesystem)
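		# A few illustrative invocations (URLs, ids and file names are placeholders):
		#   youtube-dl -t http://www.youtube.com/watch?v=abcdefghijk
		#   youtube-dl -f 17 -o '%(stitle)s-%(id)s.%(ext)s' http://www.youtube.com/watch?v=abcdefghijk
		#   youtube-dl -c -a urls.txt --playlist-start 5
		#   youtube-dl ytsearch3:some query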
2176
209e9e27 2177 (opts, args) = parser.parse_args()
2a7353b8 2178
c6fd0bb8 2179 # Batch file verification
d1580ed9 2180 batchurls = []
c6fd0bb8
RG
2181 if opts.batchfile is not None:
2182 try:
2a7353b8
RG
2183 if opts.batchfile == '-':
2184 batchfd = sys.stdin
2185 else:
2186 batchfd = open(opts.batchfile, 'r')
2187 batchurls = batchfd.readlines()
b65740e4
RG
2188 batchurls = [x.strip() for x in batchurls]
2189 batchurls = [x for x in batchurls if len(x) > 0]
c6fd0bb8
RG
2190 except IOError:
2191 sys.exit(u'ERROR: batch file could not be read')
2192 all_urls = batchurls + args
2193
209e9e27 2194 # Conflicting, missing and erroneous options
2962317d
RG
2195 if opts.bestquality:
2196 print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
209e9e27 2197 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2740c509 2198 parser.error(u'using .netrc conflicts with giving username/password')
209e9e27 2199 if opts.password is not None and opts.username is None:
2740c509 2200 parser.error(u'account username missing')
209e9e27 2201 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
2740c509 2202 parser.error(u'using output template conflicts with using title or literal title')
209e9e27 2203 if opts.usetitle and opts.useliteral:
2740c509 2204 parser.error(u'using title conflicts with using literal title')
209e9e27 2205 if opts.username is not None and opts.password is None:
76a7f364 2206 opts.password = getpass.getpass(u'Type account password and press return:')
acd3d842
RG
2207 if opts.ratelimit is not None:
2208 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2209 if numeric_limit is None:
2740c509 2210 parser.error(u'invalid rate limit specified')
acd3d842 2211 opts.ratelimit = numeric_limit
7031008c
RG
2212 if opts.retries is not None:
2213 try:
2214 opts.retries = long(opts.retries)
2215 except (TypeError, ValueError), err:
2216 parser.error(u'invalid retry count specified')
204c9398
RG
2217 if opts.playliststart is not None:
2218 try:
2219 opts.playliststart = long(opts.playliststart)
2220 except (TypeError, ValueError), err:
2221 parser.error(u'invalid playlist page specified')
4fa74b52
RG
2222
2223 # Information extractors
2224 youtube_ie = YoutubeIE()
020f7150 2225 metacafe_ie = MetacafeIE(youtube_ie)
4135fa45 2226 dailymotion_ie = DailymotionIE()
0c2dc87d 2227 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
c39c05cd 2228 youtube_user_ie = YoutubeUserIE(youtube_ie)
25af2bce 2229 youtube_search_ie = YoutubeSearchIE(youtube_ie)
49c0028a 2230 google_ie = GoogleIE()
7e58d568 2231 google_search_ie = GoogleSearchIE(google_ie)
49c0028a 2232 photobucket_ie = PhotobucketIE()
61945318 2233 yahoo_ie = YahooIE()
7e58d568 2234 yahoo_search_ie = YahooSearchIE(yahoo_ie)
490fd7ae 2235 generic_ie = GenericIE()
4fa74b52
RG
2236
2237 # File downloader
9fcd8355 2238 fd = FileDownloader({
209e9e27
RG
2239 'usenetrc': opts.usenetrc,
2240 'username': opts.username,
2241 'password': opts.password,
7e58d568 2242 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
209e9e27
RG
2243 'forceurl': opts.geturl,
2244 'forcetitle': opts.gettitle,
7e58d568
RG
2245 'forcethumbnail': opts.getthumbnail,
2246 'forcedescription': opts.getdescription,
2247 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
320becd6 2248 'format': opts.format,
f2413e67 2249 'format_limit': opts.format_limit,
eae2666c 2250 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
6ba562b0
RG
2251 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2252 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2253 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
76a7f364
RG
2254 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2255 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2256 or u'%(id)s.%(ext)s'),
0086d1ec 2257 'ignoreerrors': opts.ignoreerrors,
acd3d842 2258 'ratelimit': opts.ratelimit,
0beeff4b 2259 'nooverwrites': opts.nooverwrites,
7031008c 2260 'retries': opts.retries,
7db85b2c 2261 'continuedl': opts.continue_dl,
d9835247 2262 'noprogress': opts.noprogress,
204c9398 2263 'playliststart': opts.playliststart,
9fcd8355 2264 })
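	# Note: with no -o and no -t/-l, the template chain above falls through to
	# u'%(id)s.%(ext)s', e.g. 'abcdefghijk.flv' (illustrative id).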
25af2bce 2265 fd.add_info_extractor(youtube_search_ie)
0c2dc87d 2266 fd.add_info_extractor(youtube_pl_ie)
c39c05cd 2267 fd.add_info_extractor(youtube_user_ie)
020f7150 2268 fd.add_info_extractor(metacafe_ie)
4135fa45 2269 fd.add_info_extractor(dailymotion_ie)
4fa74b52 2270 fd.add_info_extractor(youtube_ie)
49c0028a 2271 fd.add_info_extractor(google_ie)
7e58d568 2272 fd.add_info_extractor(google_search_ie)
49c0028a 2273 fd.add_info_extractor(photobucket_ie)
61945318 2274 fd.add_info_extractor(yahoo_ie)
7e58d568 2275 fd.add_info_extractor(yahoo_search_ie)
4bec29ef 2276
490fd7ae
RG
2277 # This must come last since it's the
2278 # fallback if none of the others work
2279 fd.add_info_extractor(generic_ie)
2280
4bec29ef
RG
2281 # Update version
2282 if opts.update_self:
2283 update_self(fd, sys.argv[0])
2284
2285 # Maybe do nothing
2286 if len(all_urls) < 1:
2287 if not opts.update_self:
2288 parser.error(u'you must provide at least one URL')
2289 else:
2290 sys.exit()
c6fd0bb8 2291 retcode = fd.download(all_urls)
bb681b88 2292 sys.exit(retcode)
4fa74b52 2293
e5bf0f55
RG
2294 except DownloadError:
2295 sys.exit(1)
2296 except SameFileError:
76a7f364 2297 sys.exit(u'ERROR: fixed output name but more than one file to download')
4fa74b52 2298 except KeyboardInterrupt:
76a7f364 2299 sys.exit(u'\nERROR: Interrupted by user')