#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Ricardo Garcia Gonzalez
# Author: Danny Colligan
# Author: Benjamin Johnson
# Author: Vasyl' Vavrychuk
# Author: Witold Baryluk
# Author: Paweł Paprota
# Author: Gergely Imreh
# License: Public domain code
import cookielib
import ctypes
import datetime
import email.utils
import gzip
import htmlentitydefs
import httplib
import locale
import math
import netrc
import os
import os.path
import re
import socket
import string
import StringIO
import subprocess
import sys
import time
import urllib
import urllib2
import zlib

# parse_qs was moved from the cgi module to the urlparse module recently.
try:
    from urlparse import parse_qs
except ImportError:
    from cgi import parse_qs

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}

simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')

def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    def yield_preferredencoding():
        try:
            pref = locale.getpreferredencoding()
            u'TEST'.encode(pref)
        except:
            pref = 'UTF-8'
        while True:
            yield pref
    return yield_preferredencoding().next()

def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Unicode character
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)

def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename."""
    utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
    return utitle.replace(unicode(os.sep), u'%')

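# Illustrative note (added comment, not original code): sanitize_title() resolves
# HTML entities via htmlentity_transform() and replaces path separators, e.g.
# sanitize_title(u'Tom &amp; Jerry: 1/2') would yield u'Tom & Jerry: 1%2' on POSIX,
# so the result can safely be embedded in a filename.
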
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(filename, open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)

def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp

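# Illustrative example (added comment, not original code): timeconvert() turns an
# RFC 2822 date such as 'Sun, 06 Nov 1994 08:49:37 GMT' into the Unix timestamp
# 784111777, and returns None when the string cannot be parsed.
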
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass

class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass

class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass

class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass

class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected

class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

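# Illustrative usage (added comment, not original code): the handler is meant to be
# installed on an OpenerDirector so that every urllib2 request made by the program
# gets the standard headers and transparent gzip/deflate handling, e.g.:
#   opener = urllib2.build_opener(YoutubeDLHandler())
#   urllib2.install_opener(opener)
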
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. Since, given a video URL, the downloader doesn't know how to
    extract all the needed information (a task that InfoExtractors do), it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge of the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:         Username for authentication purposes.
    password:         Password for authentication purposes.
    usenetrc:         Use netrc for authentication instead.
    quiet:            Do not print messages to stdout.
    forceurl:         Force printing final URL.
    forcetitle:       Force printing title.
    forcethumbnail:   Force printing thumbnail URL.
    forcedescription: Force printing description.
    forcefilename:    Force printing final filename.
    simulate:         Do not download the video files.
    format:           Video format code.
    format_limit:     Highest quality format to try.
    outtmpl:          Template for output names.
    ignoreerrors:     Do not stop on download errors.
    ratelimit:        Download speed limit, in bytes/sec.
    nooverwrites:     Prevent overwriting files.
    retries:          Number of times to retry for HTTP error 5xx.
    continuedl:       Try to continue downloads if possible.
    noprogress:       Do not print the progress bar.
    playliststart:    Playlist item to start at.
    playlistend:      Playlist item to end at.
    logtostderr:      Log messages to stderr instead of stdout.
    consoletitle:     Display progress in console window's titlebar.
    nopart:           Do not use temporary .part files.
    updatetime:       Use the Last-modified header to set output file timestamps.
    """

    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None
    _screen_file = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._ies = []
        self._pps = []
        self._download_retcode = 0
        self._num_downloads = 0
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self.params = params

    @staticmethod
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        components = filename.split(os.sep)
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):
                os.mkdir(dir)

    @staticmethod
    def format_bytes(bytes):
        if bytes is None:
            return 'N/A'
        if type(bytes) is str:
            bytes = float(bytes)
        if bytes == 0.0:
            exponent = 0
        else:
            exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)

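    # Illustrative examples (added comment, not original code):
    #   format_bytes(None) -> 'N/A', format_bytes(500) -> '500.00b',
    #   format_bytes(1536) -> '1.50k', format_bytes(1048576) -> '1.00M'
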
    @staticmethod
    def calc_percent(byte_counter, data_len):
        if data_len is None:
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001: # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        dif = now - start
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return long(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return long(new_max)
        if rate < new_min:
            return long(new_min)
        return long(rate)

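    # Note (added comment, not original code): best_block_size() adapts the read size
    # towards the measured download rate, never shrinking below half the previous
    # block (with a 1-byte floor) or growing beyond twice it, and caps the result
    # at 4 MB.
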
    @staticmethod
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        if matchobj is None:
            return None
        number = float(matchobj.group(1))
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

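    # Illustrative examples (added comment, not original code):
    #   parse_bytes('50k') -> 51200, parse_bytes('1.5m') -> 1572864,
    #   parse_bytes('hello') -> None (it does not match the <number><suffix> form)
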
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        pp.set_downloader(self)

    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        try:
            if not self.params.get('quiet', False):
                terminator = [u'\n', u''][skip_eol]
                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                self._screen_file.flush()
        except (UnicodeEncodeError), err:
            if not ignore_encoding_errors:
                raise

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message.encode(preferredencoding())

    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            return
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))

    def fixed_template(self):
        """Checks if the output template is fixed."""
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

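    # Illustrative examples (added comment, not original code): with
    # outtmpl = u'%(stitle)s-%(id)s.%(ext)s' the template is not fixed (it contains
    # %(...)s substitutions), whereas a literal outtmpl such as u'video.flv' is
    # fixed, which is why download() refuses multiple URLs with a fixed template.
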
    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on whether the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            return
        now = time.time()
        elapsed = now - start_time
        if elapsed <= 0.0:
            return
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def temp_name(self, filename):
        """Returns a temporary filename for the given filename."""
        if self.params.get('nopart', False) or filename == u'-' or \
                (os.path.exists(filename) and not os.path.isfile(filename)):
            return filename
        return filename + u'.part'

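    # Illustrative examples (added comment, not original code):
    #   temp_name(u'video.flv') -> u'video.flv.part' in the default case, but the
    #   name is left untouched when the 'nopart' option is set, for u'-' (stdout),
    #   or when the target already exists and is not a regular file.
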
    def undo_temp_name(self, filename):
        if filename.endswith(u'.part'):
            return filename[:-len(u'.part')]
        return filename

    def try_rename(self, old_filename, new_filename):
        try:
            if old_filename == new_filename:
                return
            os.rename(old_filename, new_filename)
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')

    def try_utime(self, filename, last_modified_hdr):
        """Try to set the last-modified time of the given file."""
        if last_modified_hdr is None:
            return
        if not os.path.isfile(filename):
            return
        timestr = last_modified_hdr
        if timestr is None:
            return
        filetime = timeconvert(timestr)
        if filetime is None:
            return
        try:
            os.utime(filename, (time.time(), filetime))
        except:
            pass

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            return
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            self.to_screen(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        else:
            self.to_screen(u'')

    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1

    def prepare_filename(self, info_dict):
        """Generate the output filename."""
        try:
            template_dict = dict(info_dict)
            template_dict['epoch'] = unicode(long(time.time()))
            template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
            filename = self.params['outtmpl'] % template_dict
            return filename
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            return None

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        filename = self.prepare_filename(info_dict)
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Forced printings
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcefilename', False) and filename is not None:
                print filename.encode(preferredencoding(), 'xmlcharrefreplace')

            return

        if filename is None:
            return
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')
            return

        try:
            self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directories: %s' % str(err))
            return

        try:
            success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
            return
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
            return

        if success:
            try:
                self.post_process(filename, info_dict)
            except (PostProcessingError), err:
                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                return

    def download(self, url_list):
        """Download a given list of URLs."""
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):
                    continue

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it
                ie.extract(url)

                # Suitable InfoExtractor had been found; go to next URL
                break

            if not suitable_found:
                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        info = dict(ie_info)
        info['filepath'] = filename
        for pp in self._pps:
            info = pp.run(info)
            if info is None:
                break

    def _download_with_rtmpdump(self, filename, url, player_url):
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrupted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            if prevsize == cursize and retval == 1:
                break
        if retval == 0:
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
            self.try_rename(tmpfilename, filename)
            return True
        else:
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            return False

    def _do_download(self, filename, url, player_url):
        # Check file already present
        if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
            self.report_file_already_downloaded(filename)
            return True

        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        tmpfilename = self.temp_name(filename)
        stream = None
        open_mode = 'wb'

        # Do not include the Accept-Encoding header
        headers = {'Youtubedl-no-compression': 'True'}
        basic_request = urllib2.Request(url, None, headers)
        request = urllib2.Request(url, None, headers)

        # Establish possible resume length
        if os.path.isfile(tmpfilename):
            resume_len = os.path.getsize(tmpfilename)
        else:
            resume_len = 0

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range', 'bytes=%d-' % resume_len)
            open_mode = 'ab'

        count = 0
        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
            try:
                data = urllib2.urlopen(request)
                break
            except (urllib2.HTTPError, ), err:
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                    raise
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    try:
                        # Open the connection again without the range header
                        data = urllib2.urlopen(basic_request)
                        content_length = data.info()['Content-Length']
                    except (urllib2.HTTPError, ), err:
                        if err.code < 500 or err.code >= 600:
                            raise
                    else:
                        # Examine the reported length
                        if (content_length is not None and
                                (resume_len - 100 < long(content_length) < resume_len + 100)):
                            # The file had already been fully downloaded.
                            # Explanation of the above condition: in issue #175 it was revealed that
                            # YouTube sometimes adds or removes a few bytes from the end of the file,
                            # changing the file size slightly and causing problems for some users. So
                            # I decided to implement a suggested change and consider the file
                            # completely downloaded if the file size differs by less than 100 bytes from
                            # the one on the hard drive.
                            self.report_file_already_downloaded(filename)
                            self.try_rename(tmpfilename, filename)
                            return True
                        else:
                            # The length does not match, so we start the download over
                            self.report_unable_to_resume()
                            open_mode = 'wb'
                            break
            # Retry
            count += 1
            if count <= retries:
                self.report_retry(count, retries)

        if count > retries:
            self.trouble(u'ERROR: giving up after %s retries' % retries)
            return False

        data_len = data.info().get('Content-length', None)
        if data_len is not None:
            data_len = long(data_len) + resume_len
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0 + resume_len
        block_size = 1024
        start = time.time()
        while True:
            # Download and write
            before = time.time()
            data_block = data.read(block_size)
            after = time.time()
            if len(data_block) == 0:
                break
            byte_counter += len(data_block)

            # Open file just in time
            if stream is None:
                try:
                    (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
                    filename = self.undo_temp_name(tmpfilename)
                    self.report_destination(filename)
                except (OSError, IOError), err:
                    self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
                    return False
            try:
                stream.write(data_block)
            except (IOError, OSError), err:
                self.trouble(u'\nERROR: unable to write data: %s' % str(err))
                return False
            block_size = self.best_block_size(after - before, len(data_block))

            # Progress message
            percent_str = self.calc_percent(byte_counter, data_len)
            eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
            speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
            self.report_progress(percent_str, data_len_str, speed_str, eta_str)

            # Apply rate limit
            self.slow_down(start, byte_counter - resume_len)

        stream.close()
        self.report_finish()
        if data_len is not None and byte_counter != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        self.try_rename(tmpfilename, filename)

        # Update file modification time
        if self.params.get('updatetime', True):
            self.try_utime(filename, data.info().get('last-modified', None))

        return True

class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information, possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Receives a URL and returns True if suitable for this IE."""
        return False

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '45': 'webm',
    }

    @staticmethod
    def suitable(url):
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _real_initialize(self):
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description
        video_description = 'No description available.'
        if self._downloader.params.get('forcedescription', False):
            mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
            if mobj is not None:
                video_description = mobj.group(1)

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [dict(pairStr.split('=') for pairStr in uds.split('&')) for uds in url_data_strs]
            url_map = dict((ud['itag'], urllib.unquote(ud['url'])) for ud in url_data)
            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]

        else:
            self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
            return

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Find the video URL in fmt_url_map or conn parameters
            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description.decode('utf-8'),
                    'player_url': player_url,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')


class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    _youtube_ie = None

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
            return

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')


class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        return (re.match(DailymotionIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_initialize(self):
        return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
        mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')

class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        return (re.match(GoogleIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):
        return

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info
            video_thumbnail = ''


        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': u'NA',
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')

1484
1485class PhotobucketIE(InfoExtractor):
1486 """Information extractor for photobucket.com."""
1487
1488 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1489
1490 def __init__(self, downloader=None):
1491 InfoExtractor.__init__(self, downloader)
1492
1493 @staticmethod
1494 def suitable(url):
1495 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1496
1497 def report_download_webpage(self, video_id):
1498 """Report webpage download."""
331ce0a0 1499 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
49c0028a 1500
1501 def report_extraction(self, video_id):
1502 """Report information extraction."""
331ce0a0 1503 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
49c0028a 1504
1505 def _real_initialize(self):
1506 return
1507
1508 def _real_extract(self, url):
1509 # Extract id from URL
1510 mobj = re.match(self._VALID_URL, url)
1511 if mobj is None:
1512 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1513 return
1514
df372a65 1515 # At this point we have a new video
9bf7fa52 1516 self._downloader.increment_downloads()
49c0028a 1517 video_id = mobj.group(1)
1518
1519 video_extension = 'flv'
1520
1521 # Retrieve video webpage to extract further information
1522 request = urllib2.Request(url)
1523 try:
1524 self.report_download_webpage(video_id)
1525 webpage = urllib2.urlopen(request).read()
1526 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1527 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1528 return
1529
1530 # Extract URL, uploader, and title from webpage
1531 self.report_extraction(video_id)
1532 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1533 if mobj is None:
1534 self._downloader.trouble(u'ERROR: unable to extract media URL')
1535 return
1536 mediaURL = urllib.unquote(mobj.group(1))
1537
1538 video_url = mediaURL
1539
1540 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1541 if mobj is None:
1542 self._downloader.trouble(u'ERROR: unable to extract title')
1543 return
1544 video_title = mobj.group(1).decode('utf-8')
490fd7ae 1545 video_title = sanitize_title(video_title)
31cbdaaf 1546 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
49c0028a 1547
1548 video_uploader = mobj.group(2).decode('utf-8')
1549
1550 try:
1551 # Process video information
1552 self._downloader.process_info({
1553 'id': video_id.decode('utf-8'),
1554 'url': video_url.decode('utf-8'),
490fd7ae 1555 'uploader': video_uploader,
138b11f3 1556 'upload_date': u'NA',
490fd7ae 1557 'title': video_title,
31cbdaaf 1558 'stitle': simple_title,
490fd7ae 1559 'ext': video_extension.decode('utf-8'),
6ba562b0 1560 'format': u'NA',
e616ec0c 1561 'player_url': None,
490fd7ae 1562 })
73f4e7af 1563 except UnavailableVideoError:
09cc744c 1564 self._downloader.trouble(u'\nERROR: unable to download video')
490fd7ae
RG
1565
1566
61945318
RG
1567class YahooIE(InfoExtractor):
1568 """Information extractor for video.yahoo.com."""
1569
1570 # _VALID_URL matches all Yahoo! Video URLs
1571 # _VPAGE_URL matches only the extractable '/watch/' URLs
1572 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1573 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1574
1575 def __init__(self, downloader=None):
1576 InfoExtractor.__init__(self, downloader)
1577
1578 @staticmethod
1579 def suitable(url):
1580 return (re.match(YahooIE._VALID_URL, url) is not None)
1581
1582 def report_download_webpage(self, video_id):
1583 """Report webpage download."""
331ce0a0 1584 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
61945318
RG
1585
1586 def report_extraction(self, video_id):
1587 """Report information extraction."""
331ce0a0 1588 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
61945318
RG
1589
1590 def _real_initialize(self):
1591 return
1592
df372a65 1593 def _real_extract(self, url, new_video=True):
61945318
RG
1594 # Extract ID from URL
1595 mobj = re.match(self._VALID_URL, url)
1596 if mobj is None:
1597 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1598 return
1599
df372a65 1600 # At this point we have a new video
9bf7fa52 1601 self._downloader.increment_downloads()
61945318
RG
1602 video_id = mobj.group(2)
1603 video_extension = 'flv'
1604
1605 # Rewrite valid but non-extractable URLs as
1606 # extractable English language /watch/ URLs
1607 if re.match(self._VPAGE_URL, url) is None:
1608 request = urllib2.Request(url)
1609 try:
1610 webpage = urllib2.urlopen(request).read()
1611 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1612 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1613 return
1614
1615 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1616 if mobj is None:
1617 self._downloader.trouble(u'ERROR: Unable to extract id field')
1618 return
1619 yahoo_id = mobj.group(1)
1620
1621 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1622 if mobj is None:
1623 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1624 return
1625 yahoo_vid = mobj.group(1)
1626
1627 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
df372a65 1628 return self._real_extract(url, new_video=False)
61945318
RG
1629
1630 # Retrieve video webpage to extract further information
1631 request = urllib2.Request(url)
1632 try:
1633 self.report_download_webpage(video_id)
1634 webpage = urllib2.urlopen(request).read()
1635 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1636 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1637 return
1638
1639 # Extract uploader and title from webpage
1640 self.report_extraction(video_id)
1641 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1642 if mobj is None:
1643 self._downloader.trouble(u'ERROR: unable to extract video title')
1644 return
1645 video_title = mobj.group(1).decode('utf-8')
1646 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1647
1648 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1649 if mobj is None:
1650 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1651 return
1652 video_uploader = mobj.group(2).decode('utf-8') # group(1) only captures 'people' or 'profile'; the nickname is in group(2)
1653
7e58d568
RG
1654 # Extract video thumbnail
1655 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1656 if mobj is None:
1657 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1658 return
1659 video_thumbnail = mobj.group(1).decode('utf-8')
1660
1661 # Extract video description
1662 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1663 if mobj is None:
1664 self._downloader.trouble(u'ERROR: unable to extract video description')
1665 return
1666 video_description = mobj.group(1).decode('utf-8')
1667 if not video_description: video_description = 'No description available.'
1668
61945318
RG
1669 # Extract video height and width
1670 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1671 if mobj is None:
1672 self._downloader.trouble(u'ERROR: unable to extract video height')
1673 return
1674 yv_video_height = mobj.group(1)
1675
1676 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1677 if mobj is None:
1678 self._downloader.trouble(u'ERROR: unable to extract video width')
1679 return
1680 yv_video_width = mobj.group(1)
1681
1682 # Retrieve video playlist to extract media URL
1683 # I'm not completely sure what all these options are, but we
1684 # seem to need most of them, otherwise the server sends a 401.
1685 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1686 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1687 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1688 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1689 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1690 try:
1691 self.report_download_webpage(video_id)
1692 webpage = urllib2.urlopen(request).read()
1693 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1694 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1695 return
1696
1697 # Extract media URL from playlist XML
1698 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1699 if mobj is None:
1700 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1701 return
1702 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1703 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1704
1705 try:
1706 # Process video information
1707 self._downloader.process_info({
1708 'id': video_id.decode('utf-8'),
1709 'url': video_url,
1710 'uploader': video_uploader,
138b11f3 1711 'upload_date': u'NA',
61945318
RG
1712 'title': video_title,
1713 'stitle': simple_title,
1714 'ext': video_extension.decode('utf-8'),
7e58d568
RG
1715 'thumbnail': video_thumbnail.decode('utf-8'),
1716 'description': video_description,
e616ec0c 1719 'player_url': None,
61945318 1720 })
73f4e7af 1721 except UnavailableVideoError:
09cc744c 1722 self._downloader.trouble(u'\nERROR: unable to download video')
61945318
RG
1723
1724
490fd7ae
RG
1725class GenericIE(InfoExtractor):
1726 """Generic last-resort information extractor."""
1727
1728 def __init__(self, downloader=None):
1729 InfoExtractor.__init__(self, downloader)
1730
1731 @staticmethod
1732 def suitable(url):
1733 return True
1734
1735 def report_download_webpage(self, video_id):
1736 """Report webpage download."""
331ce0a0
RG
1737 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1738 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
490fd7ae
RG
1739
1740 def report_extraction(self, video_id):
1741 """Report information extraction."""
331ce0a0 1742 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
490fd7ae
RG
1743
1744 def _real_initialize(self):
1745 return
1746
1747 def _real_extract(self, url):
df372a65 1748 # At this point we have a new video
9bf7fa52 1749 self._downloader.increment_downloads()
df372a65 1750
490fd7ae
RG
1751 video_id = url.split('/')[-1]
1752 request = urllib2.Request(url)
1753 try:
1754 self.report_download_webpage(video_id)
1755 webpage = urllib2.urlopen(request).read()
1756 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1757 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1758 return
1759 except ValueError, err:
1760 # since this is the last-resort InfoExtractor, if
1761 # this error is thrown, it'll be thrown here
1762 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1763 return
1764
a9806fd8 1765 self.report_extraction(video_id)
490fd7ae
RG
1766 # Start with something easy: JW Player in SWFObject
1767 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1768 if mobj is None:
1769 # Broaden the search a little bit
1770 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1771 if mobj is None:
1772 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1773 return
1774
1775 # It's possible that one of the regexes
1776 # matched, but returned an empty group:
1777 if mobj.group(1) is None:
1778 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1779 return
1780
1781 video_url = urllib.unquote(mobj.group(1))
1782 video_id = os.path.basename(video_url)
1783
1784 # here's a fun little line of code for you:
1785 video_extension = os.path.splitext(video_id)[1][1:]
1786 video_id = os.path.splitext(video_id)[0]
1787
1788 # it's tempting to parse this further, but you would
1789 # have to take into account all the variations like
1790 # Video Title - Site Name
1791 # Site Name | Video Title
1792 # Video Title - Tagline | Site Name
1793 # and so on and so forth; it's just not practical
1794 mobj = re.search(r'<title>(.*)</title>', webpage)
1795 if mobj is None:
1796 self._downloader.trouble(u'ERROR: unable to extract title')
1797 return
1798 video_title = mobj.group(1).decode('utf-8')
1799 video_title = sanitize_title(video_title)
31cbdaaf 1800 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
490fd7ae
RG
1801
1802 # video uploader is domain name
1803 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1804 if mobj is None:
1805 self._downloader.trouble(u'ERROR: unable to extract uploader (domain name)')
1806 return
1807 video_uploader = mobj.group(1).decode('utf-8')
1808
1809 try:
1810 # Process video information
1811 self._downloader.process_info({
1812 'id': video_id.decode('utf-8'),
1813 'url': video_url.decode('utf-8'),
1814 'uploader': video_uploader,
138b11f3 1815 'upload_date': u'NA',
490fd7ae 1816 'title': video_title,
31cbdaaf 1817 'stitle': simple_title,
49c0028a 1818 'ext': video_extension.decode('utf-8'),
6ba562b0 1819 'format': u'NA',
e616ec0c 1820 'player_url': None,
49c0028a 1821 })
73f4e7af 1822 except UnavailableVideoError, err:
09cc744c 1823 self._downloader.trouble(u'\nERROR: unable to download video')
49c0028a 1824
1825
25af2bce
RG
1826class YoutubeSearchIE(InfoExtractor):
1827 """Information Extractor for YouTube search queries."""
1828 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1829 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1830 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
304a4d85 1831 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
25af2bce 1832 _youtube_ie = None
fd9288c3 1833 _max_youtube_results = 1000
25af2bce 1834
f995f712 1835 def __init__(self, youtube_ie, downloader=None):
25af2bce
RG
1836 InfoExtractor.__init__(self, downloader)
1837 self._youtube_ie = youtube_ie
d3975459 1838
25af2bce
RG
1839 @staticmethod
1840 def suitable(url):
1841 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1842
1843 def report_download_page(self, query, pagenum):
1844 """Report attempt to download playlist page with given number."""
490fd7ae 1845 query = query.decode(preferredencoding())
331ce0a0 1846 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
25af2bce
RG
1847
1848 def _real_initialize(self):
1849 self._youtube_ie.initialize()
d3975459 1850
25af2bce
RG
1851 def _real_extract(self, query):
1852 mobj = re.match(self._VALID_QUERY, query)
1853 if mobj is None:
147753eb 1854 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
6f21f686 1855 return
25af2bce
RG
1856
1857 prefix, query = query.split(':')
1858 prefix = prefix[8:]
490fd7ae 1859 query = query.encode('utf-8')
f995f712 1860 if prefix == '':
6f21f686
RG
1861 self._download_n_results(query, 1)
1862 return
f995f712 1863 elif prefix == 'all':
6f21f686
RG
1864 self._download_n_results(query, self._max_youtube_results)
1865 return
f995f712 1866 else:
25af2bce 1867 try:
e1f18b8a 1868 n = long(prefix)
25af2bce 1869 if n <= 0:
147753eb 1870 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
6f21f686 1871 return
257453b9 1872 elif n > self._max_youtube_results:
6f21f686 1873 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
257453b9 1874 n = self._max_youtube_results
6f21f686
RG
1875 self._download_n_results(query, n)
1876 return
e1f18b8a 1877 except ValueError: # parsing prefix as integer fails
6f21f686
RG
1878 self._download_n_results(query, 1)
1879 return
25af2bce
RG
1880
1881 def _download_n_results(self, query, n):
1882 """Downloads a specified number of results for a query"""
1883
1884 video_ids = []
1885 already_seen = set()
1886 pagenum = 1
1887
1888 while True:
1889 self.report_download_page(query, pagenum)
a9633f14 1890 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1987c232 1891 request = urllib2.Request(result_url)
25af2bce
RG
1892 try:
1893 page = urllib2.urlopen(request).read()
1894 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 1895 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
6f21f686 1896 return
25af2bce
RG
1897
1898 # Extract video identifiers
1899 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1900 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1901 if video_id not in already_seen:
1902 video_ids.append(video_id)
1903 already_seen.add(video_id)
1904 if len(video_ids) == n:
1905 # Specified n videos reached
25af2bce 1906 for id in video_ids:
6f21f686
RG
1907 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1908 return
25af2bce 1909
304a4d85 1910 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
25af2bce 1911 for id in video_ids:
6f21f686
RG
1912 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1913 return
25af2bce
RG
1914
1915 pagenum = pagenum + 1
1916
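# Illustrative sketch only (not part of youtube-dl): how the 'ytsearchN:' prefix
# handled above maps onto a result count. The helper name is hypothetical, the
# 1000-result cap mirrors _max_youtube_results, and error handling is omitted.
def _example_parse_ytsearch_prefix(query):
    """Return the number of results implied by a 'ytsearch' query string."""
    prefix, terms = query.split(':', 1)
    prefix = prefix[len('ytsearch'):]    # '', 'all' or a decimal count
    if prefix == '':
        return 1                         # bare 'ytsearch:' downloads a single result
    if prefix == 'all':
        return 1000                      # capped like _max_youtube_results
    return min(long(prefix), 1000)
# Example: _example_parse_ytsearch_prefix('ytsearch25:free software') -> 25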
7e58d568
RG
1917class GoogleSearchIE(InfoExtractor):
1918 """Information Extractor for Google Video search queries."""
1919 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1920 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1921 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1922 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1923 _google_ie = None
1924 _max_google_results = 1000
1925
1926 def __init__(self, google_ie, downloader=None):
1927 InfoExtractor.__init__(self, downloader)
1928 self._google_ie = google_ie
d3975459 1929
7e58d568
RG
1930 @staticmethod
1931 def suitable(url):
1932 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1933
1934 def report_download_page(self, query, pagenum):
1935 """Report attempt to download playlist page with given number."""
1936 query = query.decode(preferredencoding())
331ce0a0 1937 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
7e58d568
RG
1938
1939 def _real_initialize(self):
1940 self._google_ie.initialize()
d3975459 1941
7e58d568
RG
1942 def _real_extract(self, query):
1943 mobj = re.match(self._VALID_QUERY, query)
1944 if mobj is None:
1945 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1946 return
1947
1948 prefix, query = query.split(':')
1949 prefix = prefix[8:]
1950 query = query.encode('utf-8')
1951 if prefix == '':
1952 self._download_n_results(query, 1)
1953 return
1954 elif prefix == 'all':
1955 self._download_n_results(query, self._max_google_results)
1956 return
1957 else:
1958 try:
1959 n = long(prefix)
1960 if n <= 0:
1961 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1962 return
1963 elif n > self._max_google_results:
1964 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1965 n = self._max_google_results
1966 self._download_n_results(query, n)
1967 return
1968 except ValueError: # parsing prefix as integer fails
1969 self._download_n_results(query, 1)
1970 return
1971
1972 def _download_n_results(self, query, n):
1973 """Downloads a specified number of results for a query"""
1974
1975 video_ids = []
1976 already_seen = set()
1977 pagenum = 1
1978
1979 while True:
1980 self.report_download_page(query, pagenum)
1981 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1987c232 1982 request = urllib2.Request(result_url)
7e58d568
RG
1983 try:
1984 page = urllib2.urlopen(request).read()
1985 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1986 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1987 return
1988
1989 # Extract video identifiers
1990 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1991 video_id = mobj.group(1)
1992 if video_id not in already_seen:
1993 video_ids.append(video_id)
1994 already_seen.add(video_id)
1995 if len(video_ids) == n:
1996 # Specified n videos reached
1997 for id in video_ids:
1998 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1999 return
2000
2001 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2002 for id in video_ids:
2003 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2004 return
2005
2006 pagenum = pagenum + 1
2007
2008class YahooSearchIE(InfoExtractor):
2009 """Information Extractor for Yahoo! Video search queries."""
2010 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2011 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2012 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2013 _MORE_PAGES_INDICATOR = r'\s*Next'
2014 _yahoo_ie = None
2015 _max_yahoo_results = 1000
2016
2017 def __init__(self, yahoo_ie, downloader=None):
2018 InfoExtractor.__init__(self, downloader)
2019 self._yahoo_ie = yahoo_ie
d3975459 2020
7e58d568
RG
2021 @staticmethod
2022 def suitable(url):
2023 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2024
2025 def report_download_page(self, query, pagenum):
2026 """Report attempt to download playlist page with given number."""
2027 query = query.decode(preferredencoding())
331ce0a0 2028 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
7e58d568
RG
2029
2030 def _real_initialize(self):
2031 self._yahoo_ie.initialize()
d3975459 2032
7e58d568
RG
2033 def _real_extract(self, query):
2034 mobj = re.match(self._VALID_QUERY, query)
2035 if mobj is None:
2036 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2037 return
2038
2039 prefix, query = query.split(':')
2040 prefix = prefix[8:]
2041 query = query.encode('utf-8')
2042 if prefix == '':
2043 self._download_n_results(query, 1)
2044 return
2045 elif prefix == 'all':
2046 self._download_n_results(query, self._max_yahoo_results)
2047 return
2048 else:
2049 try:
2050 n = long(prefix)
2051 if n <= 0:
2052 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2053 return
2054 elif n > self._max_yahoo_results:
2055 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2056 n = self._max_yahoo_results
2057 self._download_n_results(query, n)
2058 return
2059 except ValueError: # parsing prefix as integer fails
2060 self._download_n_results(query, 1)
2061 return
2062
2063 def _download_n_results(self, query, n):
2064 """Downloads a specified number of results for a query"""
2065
2066 video_ids = []
2067 already_seen = set()
2068 pagenum = 1
2069
2070 while True:
2071 self.report_download_page(query, pagenum)
2072 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1987c232 2073 request = urllib2.Request(result_url)
7e58d568
RG
2074 try:
2075 page = urllib2.urlopen(request).read()
2076 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2077 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2078 return
2079
2080 # Extract video identifiers
2081 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2082 video_id = mobj.group(1)
2083 if video_id not in already_seen:
2084 video_ids.append(video_id)
2085 already_seen.add(video_id)
2086 if len(video_ids) == n:
2087 # Specified n videos reached
2088 for id in video_ids:
2089 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2090 return
2091
2092 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2093 for id in video_ids:
2094 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2095 return
2096
2097 pagenum = pagenum + 1
2098
0c2dc87d
RG
2099class YoutubePlaylistIE(InfoExtractor):
2100 """Information Extractor for YouTube playlists."""
2101
d119b54d 2102 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
f74e22ae 2103 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
0c2dc87d 2104 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
ce5cafea 2105 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
0c2dc87d
RG
2106 _youtube_ie = None
2107
2108 def __init__(self, youtube_ie, downloader=None):
2109 InfoExtractor.__init__(self, downloader)
2110 self._youtube_ie = youtube_ie
d3975459 2111
0c2dc87d
RG
2112 @staticmethod
2113 def suitable(url):
2114 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2115
2116 def report_download_page(self, playlist_id, pagenum):
2117 """Report attempt to download playlist page with given number."""
331ce0a0 2118 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
0c2dc87d
RG
2119
2120 def _real_initialize(self):
2121 self._youtube_ie.initialize()
d3975459 2122
0c2dc87d
RG
2123 def _real_extract(self, url):
2124 # Extract playlist id
2125 mobj = re.match(self._VALID_URL, url)
2126 if mobj is None:
147753eb 2127 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
6f21f686 2128 return
0c2dc87d 2129
d119b54d
RG
2130 # Single video case
2131 if mobj.group(3) is not None:
2132 self._youtube_ie.extract(mobj.group(3))
2133 return
2134
0c2dc87d 2135 # Download playlist pages
f74e22ae
GI
2136 # prefix is 'p' as default for playlists but there are other types that need extra care
2137 playlist_prefix = mobj.group(1)
2138 if playlist_prefix == 'a':
2139 playlist_access = 'artist'
2140 else:
7cc3c6fd 2141 playlist_prefix = 'p'
f74e22ae
GI
2142 playlist_access = 'view_play_list'
2143 playlist_id = mobj.group(2)
0c2dc87d
RG
2144 video_ids = []
2145 pagenum = 1
2146
2147 while True:
2148 self.report_download_page(playlist_id, pagenum)
f74e22ae 2149 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
0c2dc87d
RG
2150 try:
2151 page = urllib2.urlopen(request).read()
2152 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 2153 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
6f21f686 2154 return
0c2dc87d
RG
2155
2156 # Extract video identifiers
27d98b6e 2157 ids_in_page = []
0c2dc87d 2158 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
27d98b6e
RG
2159 if mobj.group(1) not in ids_in_page:
2160 ids_in_page.append(mobj.group(1))
2161 video_ids.extend(ids_in_page)
0c2dc87d 2162
ce5cafea 2163 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
0c2dc87d
RG
2164 break
2165 pagenum = pagenum + 1
2166
8cc44341
RG
2167 playliststart = self._downloader.params.get('playliststart', 1) - 1
2168 playlistend = self._downloader.params.get('playlistend', -1)
2169 video_ids = video_ids[playliststart:playlistend]
2170
0c2dc87d 2171 for id in video_ids:
6f21f686
RG
2172 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2173 return
0c2dc87d 2174
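# Illustrative sketch only (hypothetical helper): the slice semantics behind the
# playliststart/playlistend handling above. Note that with the default
# playlistend of -1 a plain slice stops one element short of the end;
# YoutubeUserIE below special-cases playlistend == -1 to keep the last video.
def _example_playlist_slice(video_ids, playliststart=1, playlistend=-1):
    return video_ids[playliststart - 1:playlistend]
# _example_playlist_slice(['a', 'b', 'c', 'd'])       -> ['a', 'b', 'c']
# _example_playlist_slice(['a', 'b', 'c', 'd'], 2, 4) -> ['b', 'c', 'd']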
c39c05cd
A
2175class YoutubeUserIE(InfoExtractor):
2176 """Information Extractor for YouTube users."""
2177
5aba6ea4 2178 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
c39c05cd 2179 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
5aba6ea4
RG
2180 _GDATA_PAGE_SIZE = 50
2181 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2182 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
c39c05cd
A
2183 _youtube_ie = None
2184
2185 def __init__(self, youtube_ie, downloader=None):
2186 InfoExtractor.__init__(self, downloader)
2187 self._youtube_ie = youtube_ie
d3975459 2188
c39c05cd
A
2189 @staticmethod
2190 def suitable(url):
2191 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2192
5aba6ea4 2193 def report_download_page(self, username, start_index):
c39c05cd 2194 """Report attempt to download user page."""
5aba6ea4
RG
2195 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2196 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
c39c05cd
A
2197
2198 def _real_initialize(self):
2199 self._youtube_ie.initialize()
d3975459 2200
c39c05cd
A
2201 def _real_extract(self, url):
2202 # Extract username
2203 mobj = re.match(self._VALID_URL, url)
2204 if mobj is None:
2205 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2206 return
2207
c39c05cd 2208 username = mobj.group(1)
5aba6ea4
RG
2209
2210 # Download video ids using the YouTube Data API. Result size per
2211 # query is limited (currently to 50 videos), so we query
2212 # page by page until a page comes back with no video ids,
2213 # which means we have got all of them.
2214
c39c05cd 2215 video_ids = []
5aba6ea4 2216 pagenum = 0
c39c05cd 2217
5aba6ea4
RG
2218 while True:
2219 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2220 self.report_download_page(username, start_index)
c39c05cd 2221
5aba6ea4 2222 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
c39c05cd 2223
5aba6ea4
RG
2224 try:
2225 page = urllib2.urlopen(request).read()
2226 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2227 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2228 return
c39c05cd 2229
5aba6ea4
RG
2230 # Extract video identifiers
2231 ids_in_page = []
2232
2233 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2234 if mobj.group(1) not in ids_in_page:
2235 ids_in_page.append(mobj.group(1))
2236
2237 video_ids.extend(ids_in_page)
2238
2239 # A little optimization - if current page is not
2240 # "full", i.e. does not contain _GDATA_PAGE_SIZE video ids, then
2241 # we can assume that this page is the last one - there
2242 # are no more ids on further pages - no need to query
2243 # again.
2244
2245 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2246 break
2247
2248 pagenum += 1
2249
2250 all_ids_count = len(video_ids)
8cc44341
RG
2251 playliststart = self._downloader.params.get('playliststart', 1) - 1
2252 playlistend = self._downloader.params.get('playlistend', -1)
204c9398 2253
5aba6ea4
RG
2254 if playlistend == -1:
2255 video_ids = video_ids[playliststart:]
2256 else:
2257 video_ids = video_ids[playliststart:playlistend]
2258
2259 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2260 (username, all_ids_count, len(video_ids)))
2261
2262 for video_id in video_ids:
2263 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2264
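# Illustrative sketch only (hypothetical helper): the 1-based start-index
# arithmetic used by YoutubeUserIE above when paging through the GData
# uploads feed in chunks of _GDATA_PAGE_SIZE entries.
def _example_gdata_start_indices(pages, page_size=50):
    return [pagenum * page_size + 1 for pagenum in range(pages)]
# _example_gdata_start_indices(3) -> [1, 51, 101]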
c39c05cd 2265
27179cfd
VV
2266class DepositFilesIE(InfoExtractor):
2267 """Information extractor for depositfiles.com"""
2268
2269 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2270
2271 def __init__(self, downloader=None):
2272 InfoExtractor.__init__(self, downloader)
2273
2274 @staticmethod
2275 def suitable(url):
2276 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2277
2278 def report_download_webpage(self, file_id):
2279 """Report webpage download."""
2280 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2281
2282 def report_extraction(self, file_id):
2283 """Report information extraction."""
2284 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2285
2286 def _real_initialize(self):
2287 return
2288
2289 def _real_extract(self, url):
2290 # At this point we have a new file
2291 self._downloader.increment_downloads()
2292
2293 file_id = url.split('/')[-1]
2294 # Rebuild url in english locale
2295 url = 'http://depositfiles.com/en/files/' + file_id
2296
2297 # Retrieve file webpage with 'Free download' button pressed
2298 free_download_indication = { 'gateway_result' : '1' }
1987c232 2299 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
27179cfd
VV
2300 try:
2301 self.report_download_webpage(file_id)
2302 webpage = urllib2.urlopen(request).read()
2303 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2304 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2305 return
2306
2307 # Search for the real file URL
2308 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2309 if (mobj is None) or (mobj.group(1) is None):
2310 # Try to figure out the reason for the error.
2311 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2312 if (mobj is not None) and (mobj.group(1) is not None):
2313 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2314 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2315 else:
2316 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2317 return
2318
2319 file_url = mobj.group(1)
2320 file_extension = os.path.splitext(file_url)[1][1:]
2321
2322 # Search for file title
2323 mobj = re.search(r'<b title="(.*?)">', webpage)
2324 if mobj is None:
2325 self._downloader.trouble(u'ERROR: unable to extract title')
2326 return
2327 file_title = mobj.group(1).decode('utf-8')
2328
2329 try:
2330 # Process file information
2331 self._downloader.process_info({
2332 'id': file_id.decode('utf-8'),
2333 'url': file_url.decode('utf-8'),
2334 'uploader': u'NA',
2335 'upload_date': u'NA',
2336 'title': file_title,
2337 'stitle': file_title,
2338 'ext': file_extension.decode('utf-8'),
2339 'format': u'NA',
2340 'player_url': None,
2341 })
2342 except UnavailableVideoError, err:
2343 self._downloader.trouble(u'ERROR: unable to download file')
2344
9f5f9602
GI
2345class FacebookIE(InfoExtractor):
2346 """Information Extractor for Facebook"""
2347
2348 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2349 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2350 _NETRC_MACHINE = 'facebook'
2351 _available_formats = ['highqual', 'lowqual']
2352 _video_extensions = {
2353 'highqual': 'mp4',
2354 'lowqual': 'mp4',
2355 }
2356
2357 def __init__(self, downloader=None):
2358 InfoExtractor.__init__(self, downloader)
2359
2360 @staticmethod
2361 def suitable(url):
2362 return (re.match(FacebookIE._VALID_URL, url) is not None)
2363
2364 def _reporter(self, message):
2365 """Add header and report message."""
2366 self._downloader.to_screen(u'[facebook] %s' % message)
2367
2368 def report_login(self):
2369 """Report attempt to log in."""
2370 self._reporter(u'Logging in')
2371
2372 def report_video_webpage_download(self, video_id):
2373 """Report attempt to download video webpage."""
2374 self._reporter(u'%s: Downloading video webpage' % video_id)
2375
2376 def report_information_extraction(self, video_id):
2377 """Report attempt to extract video information."""
2378 self._reporter(u'%s: Extracting video information' % video_id)
2379
2380 def _parse_page(self, video_webpage):
2381 """Extract video information from page"""
2382 # General data
2383 data = {'title': r'class="video_title datawrap">(.*?)</',
2384 'description': r'<div class="datawrap">(.*?)</div>',
2385 'owner': r'\("video_owner_name", "(.*?)"\)',
2386 'upload_date': r'data-date="(.*?)"',
2387 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2388 }
2389 video_info = {}
2390 for piece in data.keys():
2391 mobj = re.search(data[piece], video_webpage)
2392 if mobj is not None:
2393 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2394
2395 # Video urls
2396 video_urls = {}
2397 for fmt in self._available_formats:
2398 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2399 if mobj is not None:
2400 # URL is in a Javascript segment inside an escaped Unicode format within
2401 # the generally utf-8 page
2402 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2403 video_info['video_urls'] = video_urls
2404
2405 return video_info
2406
2407 def _real_initialize(self):
2408 if self._downloader is None:
2409 return
2410
2411 useremail = None
2412 password = None
2413 downloader_params = self._downloader.params
2414
2415 # Attempt to use provided username and password or .netrc data
2416 if downloader_params.get('username', None) is not None:
2417 useremail = downloader_params['username']
2418 password = downloader_params['password']
2419 elif downloader_params.get('usenetrc', False):
2420 try:
2421 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2422 if info is not None:
2423 useremail = info[0]
2424 password = info[2]
2425 else:
2426 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2427 except (IOError, netrc.NetrcParseError), err:
2428 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2429 return
2430
2431 if useremail is None:
2432 return
2433
2434 # Log in
2435 login_form = {
2436 'email': useremail,
2437 'pass': password,
2438 'login': 'Log+In'
2439 }
2440 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2441 try:
2442 self.report_login()
2443 login_results = urllib2.urlopen(request).read()
2444 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2445 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
2446 return
2447 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2448 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2449 return
2450
2451 def _real_extract(self, url):
2452 mobj = re.match(self._VALID_URL, url)
2453 if mobj is None:
2454 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2455 return
2456 video_id = mobj.group('ID')
2457
2458 # Get video webpage
2459 self.report_video_webpage_download(video_id)
2460 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2461 try:
2462 page = urllib2.urlopen(request)
2463 video_webpage = page.read()
2464 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2465 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2466 return
2467
2468 # Start extracting information
2469 self.report_information_extraction(video_id)
2470
2471 # Extract information
2472 video_info = self._parse_page(video_webpage)
2473
2474 # uploader
2475 if 'owner' not in video_info:
2476 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2477 return
2478 video_uploader = video_info['owner']
2479
2480 # title
2481 if 'title' not in video_info:
2482 self._downloader.trouble(u'ERROR: unable to extract video title')
2483 return
2484 video_title = video_info['title']
2485 video_title = video_title.decode('utf-8')
2486 video_title = sanitize_title(video_title)
2487
2488 # simplified title
2489 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2490 simple_title = simple_title.strip(ur'_')
2491
2492 # thumbnail image
2493 if 'thumbnail' not in video_info:
2494 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2495 video_thumbnail = ''
2496 else:
2497 video_thumbnail = video_info['thumbnail']
2498
2499 # upload date
2500 upload_date = u'NA'
2501 if 'upload_date' in video_info:
2502 upload_time = video_info['upload_date']
2503 timetuple = email.utils.parsedate_tz(upload_time)
2504 if timetuple is not None:
2505 try:
2506 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2507 except:
2508 pass
2509
2510 # description
2511 video_description = 'No description available.'
2512 if (self._downloader.params.get('forcedescription', False) and
2513 'description' in video_info):
2514 video_description = video_info['description']
2515
2516 url_map = video_info['video_urls']
2517 if len(url_map.keys()) > 0:
2518 # Decide which formats to download
2519 req_format = self._downloader.params.get('format', None)
2520 format_limit = self._downloader.params.get('format_limit', None)
2521
2522 if format_limit is not None and format_limit in self._available_formats:
2523 format_list = self._available_formats[self._available_formats.index(format_limit):]
2524 else:
2525 format_list = self._available_formats
2526 existing_formats = [x for x in format_list if x in url_map]
2527 if len(existing_formats) == 0:
2528 self._downloader.trouble(u'ERROR: no known formats available for video')
2529 return
2530 if req_format is None:
2531 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2532 elif req_format == '-1':
2533 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2534 else:
2535 # Specific format
2536 if req_format not in url_map:
2537 self._downloader.trouble(u'ERROR: requested format not available')
2538 return
2539 video_url_list = [(req_format, url_map[req_format])] # Specific format
2540
2541 for format_param, video_real_url in video_url_list:
2542
2543 # At this point we have a new video
2544 self._downloader.increment_downloads()
2545
2546 # Extension
2547 video_extension = self._video_extensions.get(format_param, 'mp4')
2548
2549 # Find the video URL in fmt_url_map or conn parameters
2550 try:
2551 # Process video information
2552 self._downloader.process_info({
2553 'id': video_id.decode('utf-8'),
2554 'url': video_real_url.decode('utf-8'),
2555 'uploader': video_uploader.decode('utf-8'),
2556 'upload_date': upload_date,
2557 'title': video_title,
2558 'stitle': simple_title,
2559 'ext': video_extension.decode('utf-8'),
2560 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2561 'thumbnail': video_thumbnail.decode('utf-8'),
2562 'description': video_description.decode('utf-8'),
2563 'player_url': None,
2564 })
2565 except UnavailableVideoError, err:
2566 self._downloader.trouble(u'\nERROR: unable to download video')
2567
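# Illustrative sketch only (hypothetical helper, format_limit handling omitted):
# the format-selection policy used by FacebookIE above - best available format
# by default, every format for req_format == '-1', otherwise the exact
# requested format or nothing.
def _example_select_formats(url_map, available, req_format=None):
    existing = [f for f in available if f in url_map]
    if not existing:
        return []
    if req_format is None:
        return [(existing[0], url_map[existing[0]])]   # best quality
    if req_format == '-1':
        return [(f, url_map[f]) for f in existing]     # all formats
    return [(req_format, url_map[req_format])] if req_format in url_map else []
# _example_select_formats({'lowqual': 'http://cdn.example/lq.mp4'}, ['highqual', 'lowqual'])
# -> [('lowqual', 'http://cdn.example/lq.mp4')]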
65cd34c5
RG
2568class PostProcessor(object):
2569 """Post Processor class.
2570
2571 PostProcessor objects can be added to downloaders with their
2572 add_post_processor() method. When the downloader has finished a
2573 successful download, it will take its internal chain of PostProcessors
2574 and start calling the run() method on each one of them, first with
2575 an initial argument and then with the returned value of the previous
2576 PostProcessor.
2577
2578 The chain will be stopped if one of them ever returns None or the end
2579 of the chain is reached.
2580
2581 PostProcessor objects follow a "mutual registration" process similar
2582 to InfoExtractor objects.
2583 """
2584
2585 _downloader = None
2586
2587 def __init__(self, downloader=None):
2588 self._downloader = downloader
2589
65cd34c5
RG
2590 def set_downloader(self, downloader):
2591 """Sets the downloader for this PP."""
2592 self._downloader = downloader
d3975459 2593
65cd34c5
RG
2594 def run(self, information):
2595 """Run the PostProcessor.
2596
2597 The "information" argument is a dictionary like the ones
2f11508a 2598 composed by InfoExtractors. The only difference is that this
65cd34c5
RG
2599 one has an extra field called "filepath" that points to the
2600 downloaded file.
2601
2602 When this method returns None, the postprocessing chain is
2603 stopped. However, this method may return an information
2604 dictionary that will be passed to the next postprocessing
2605 object in the chain. It can be the one it received after
2606 changing some fields.
2607
2608 In addition, this method may raise a PostProcessingError
2609 exception that will be taken into account by the downloader
2610 it was called from.
2611 """
2612 return information # by default, do nothing
d3975459 2613
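# Illustrative sketch only (hypothetical subclass): the chaining contract
# described in the PostProcessor docstring. A downloader registered with
# add_post_processor() calls run() on each processor in turn, feeding the
# returned info dict to the next one; returning None stops the chain.
class _ExamplePP(PostProcessor):
    def run(self, information):
        # assumes a downloader has been set via __init__ or set_downloader()
        self._downloader.to_screen(u'[example] downloaded to %s' % information['filepath'])
        return information    # hand the (possibly modified) dict to the next PP
# Usage (fd being a FileDownloader): fd.add_post_processor(_ExamplePP(fd))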
3072fab1
RG
2614class FFmpegExtractAudioPP(PostProcessor):
2615
2616 def __init__(self, downloader=None, preferredcodec=None):
2617 PostProcessor.__init__(self, downloader)
2618 if preferredcodec is None:
2619 preferredcodec = 'best'
2620 self._preferredcodec = preferredcodec
2621
2622 @staticmethod
2623 def get_audio_codec(path):
da273188 2624 try:
2727dbf7
RG
2625 cmd = ['ffprobe', '-show_streams', '--', path]
2626 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
da273188
RG
2627 output = handle.communicate()[0]
2628 if handle.wait() != 0:
2629 return None
2630 except (IOError, OSError):
3072fab1
RG
2631 return None
2632 audio_codec = None
2633 for line in output.split('\n'):
2634 if line.startswith('codec_name='):
2635 audio_codec = line.split('=')[1].strip()
2636 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
2637 return audio_codec
2638 return None
2639
2640 @staticmethod
2641 def run_ffmpeg(path, out_path, codec, more_opts):
2642 try:
2727dbf7
RG
2643 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
2644 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3072fab1
RG
2645 return (ret == 0)
2646 except (IOError, OSError):
2647 return False
2648
2649 def run(self, information):
2650 path = information['filepath']
2651
2652 filecodec = self.get_audio_codec(path)
2653 if filecodec is None:
da273188 2654 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
3072fab1
RG
2655 return None
2656
2657 more_opts = []
2658 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
2659 if filecodec == 'aac' or filecodec == 'mp3':
2660 # Lossless if possible
2661 acodec = 'copy'
2662 extension = filecodec
2663 if filecodec == 'aac':
2664 more_opts = ['-f', 'adts']
2665 else:
2666 # MP3 otherwise.
2667 acodec = 'libmp3lame'
2668 extension = 'mp3'
2669 more_opts = ['-ab', '128k']
2670 else:
2671 # We convert the audio (lossy)
2672 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
2673 extension = self._preferredcodec
2674 more_opts = ['-ab', '128k']
2675 if self._preferredcodec == 'aac':
2676 more_opts += ['-f', 'adts']
2677
2678 (prefix, ext) = os.path.splitext(path)
2679 new_path = prefix + '.' + extension
2680 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
2681 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
2682
2683 if not status:
1bd92582 2684 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
3072fab1
RG
2685 return None
2686
2687 try:
2688 os.remove(path)
2689 except (IOError, OSError):
2690 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
2691 return None
2692
2693 information['filepath'] = new_path
2694 return information
2695
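# Illustrative sketch only (hypothetical helper): the ffmpeg invocation that
# run_ffmpeg() above ends up building, e.g. when losslessly remuxing an AAC
# audio stream out of a downloaded .flv file ('-acodec copy' plus '-f adts').
def _example_ffmpeg_command(path, out_path, codec='copy', more_opts=('-f', 'adts')):
    return ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + list(more_opts) + ['--', out_path]
# _example_ffmpeg_command('video.flv', 'video.aac') ->
#   ['ffmpeg', '-y', '-i', 'video.flv', '-vn', '-acodec', 'copy', '-f', 'adts', '--', 'video.aac']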
65cd34c5 2696### MAIN PROGRAM ###
4fa74b52
RG
2697if __name__ == '__main__':
2698 try:
f9f1e798 2699 # Modules needed only when running the main program
209e9e27 2700 import getpass
f9f1e798
RG
2701 import optparse
2702
0fe64c04 2703 # Function to update the program file with the latest version from the repository.
4bec29ef
RG
2704 def update_self(downloader, filename):
2705 # Note: downloader only used for options
0fe64c04 2706 if not os.access(filename, os.W_OK):
4bec29ef
RG
2707 sys.exit('ERROR: no write permissions on %s' % filename)
2708
331ce0a0 2709 downloader.to_screen('Updating to latest stable version...')
0fe64c04
RG
2710 try:
2711 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2712 latest_version = urllib.urlopen(latest_url).read().strip()
2713 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2714 newcontent = urllib.urlopen(prog_url).read()
2715 except (IOError, OSError), err:
2716 sys.exit('ERROR: unable to download latest version')
2717 try:
2718 stream = open(filename, 'w')
2719 stream.write(newcontent)
2720 stream.close()
2721 except (IOError, OSError), err:
2722 sys.exit('ERROR: unable to overwrite current version')
331ce0a0 2723 downloader.to_screen('Updated to version %s' % latest_version)
4bec29ef 2724
f9f1e798 2725 # Parse command line
209e9e27 2726 parser = optparse.OptionParser(
7b7759f5 2727 usage='Usage: %prog [options] url...',
33d507f1 2728 version='2011.08.04',
7b7759f5 2729 conflict_handler='resolve',
2730 )
2731
209e9e27
RG
2732 parser.add_option('-h', '--help',
2733 action='help', help='print this help text and exit')
2734 parser.add_option('-v', '--version',
2735 action='version', help='print program version and exit')
4bec29ef
RG
2736 parser.add_option('-U', '--update',
2737 action='store_true', dest='update_self', help='update this program to latest stable version')
7b7759f5 2738 parser.add_option('-i', '--ignore-errors',
2739 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2740 parser.add_option('-r', '--rate-limit',
2b06c33d 2741 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
7031008c 2742 parser.add_option('-R', '--retries',
2b06c33d 2743 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
204c9398
RG
2744 parser.add_option('--playlist-start',
2745 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
8cc44341
RG
2746 parser.add_option('--playlist-end',
2747 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
e7cf18cb 2748 parser.add_option('--dump-user-agent',
6025795d
RG
2749 action='store_true', dest='dump_user_agent',
2750 help='display the current browser identification', default=False)
7b7759f5 2751
2752 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2753 authentication.add_option('-u', '--username',
2b06c33d 2754 dest='username', metavar='USERNAME', help='account username')
7b7759f5 2755 authentication.add_option('-p', '--password',
2b06c33d 2756 dest='password', metavar='PASSWORD', help='account password')
7b7759f5 2757 authentication.add_option('-n', '--netrc',
209e9e27 2758 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
7b7759f5 2759 parser.add_option_group(authentication)
2760
2761 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2762 video_format.add_option('-f', '--format',
2b06c33d 2763 action='store', dest='format', metavar='FORMAT', help='video format code')
6ba562b0
RG
2764 video_format.add_option('--all-formats',
2765 action='store_const', dest='format', help='download all available video formats', const='-1')
f2413e67 2766 video_format.add_option('--max-quality',
460d8acb 2767 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
7b7759f5 2768 parser.add_option_group(video_format)
2769
2770 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2771 verbosity.add_option('-q', '--quiet',
2772 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2773 verbosity.add_option('-s', '--simulate',
2774 action='store_true', dest='simulate', help='do not download video', default=False)
2775 verbosity.add_option('-g', '--get-url',
2776 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2777 verbosity.add_option('-e', '--get-title',
2778 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
7e58d568 2779 verbosity.add_option('--get-thumbnail',
6025795d
RG
2780 action='store_true', dest='getthumbnail',
2781 help='simulate, quiet but print thumbnail URL', default=False)
7e58d568 2782 verbosity.add_option('--get-description',
6025795d
RG
2783 action='store_true', dest='getdescription',
2784 help='simulate, quiet but print video description', default=False)
9f796346 2785 verbosity.add_option('--get-filename',
6025795d
RG
2786 action='store_true', dest='getfilename',
2787 help='simulate, quiet but print output filename', default=False)
d9835247
RG
2788 verbosity.add_option('--no-progress',
2789 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
ccbd296b 2790 verbosity.add_option('--console-title',
6025795d
RG
2791 action='store_true', dest='consoletitle',
2792 help='display progress in console titlebar', default=False)
7b7759f5 2793 parser.add_option_group(verbosity)
2794
2795 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
1c76e23e
RG
2796 filesystem.add_option('-t', '--title',
2797 action='store_true', dest='usetitle', help='use title in file name', default=False)
2798 filesystem.add_option('-l', '--literal',
2799 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
1e47d226 2800 filesystem.add_option('-A', '--auto-number',
6025795d
RG
2801 action='store_true', dest='autonumber',
2802 help='number downloaded files starting from 00000', default=False)
7b7759f5 2803 filesystem.add_option('-o', '--output',
2b06c33d 2804 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
7b7759f5 2805 filesystem.add_option('-a', '--batch-file',
2b06c33d 2806 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
7b7759f5 2807 filesystem.add_option('-w', '--no-overwrites',
0beeff4b 2808 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
f76c2df6
PI
2809 filesystem.add_option('-c', '--continue',
2810 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
80066952
RG
2811 filesystem.add_option('--cookies',
2812 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3fb2c487
RG
2813 filesystem.add_option('--no-part',
2814 action='store_true', dest='nopart', help='do not use .part files', default=False)
e3018902
RG
2815 filesystem.add_option('--no-mtime',
2816 action='store_false', dest='updatetime',
2817 help='do not use the Last-modified header to set the file modification time', default=True)
7b7759f5 2818 parser.add_option_group(filesystem)
2819
3072fab1
RG
2820 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
2821 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
2822 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
2823 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
2824 help='"best", "aac" or "mp3"; best by default')
2825 parser.add_option_group(postproc)
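# Example (hypothetical invocation) for the post-processing options above; as the
# help text notes, ffmpeg and ffprobe have to be available for this to work:
#
#   youtube-dl --extract-audio --audio-format mp3 URL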
2826
209e9e27 2827 (opts, args) = parser.parse_args()
2a7353b8 2828
80066952
RG
2829 # Open appropriate CookieJar
2830 if opts.cookiefile is None:
2831 jar = cookielib.CookieJar()
2832 else:
2833 try:
2834 jar = cookielib.MozillaCookieJar(opts.cookiefile)
e0c982c8
RG
2835 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2836 jar.load()
80066952
RG
2837 except (IOError, OSError), err:
2838 sys.exit(u'ERROR: unable to open cookie file')
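# MozillaCookieJar uses the Netscape cookies.txt format (tab-separated fields:
# domain, include-subdomains flag, path, secure flag, expiry timestamp, name,
# value). The jar is loaded here only when the file already exists and is
# readable; it is written back with jar.save() shortly before the program exits.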
2839
e7cf18cb
RG
2840 # Dump user agent
2841 if opts.dump_user_agent:
2842 print std_headers['User-Agent']
2843 sys.exit(0)
2844
80066952
RG
2845 # General configuration
2846 cookie_processor = urllib2.HTTPCookieProcessor(jar)
1987c232 2847 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
80066952
RG
2848 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
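# A note on the opener chain above (interpretation, not spelled out in the code):
# ProxyHandler() with no arguments honours the usual http_proxy/https_proxy
# environment variables, HTTPCookieProcessor attaches the cookie jar to every
# request, and YoutubeDLHandler is presumably the project's handler for the
# gzip/deflate encodings advertised in std_headers['Accept-Encoding'].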
2849
c6fd0bb8 2850 # Batch file verification
d1580ed9 2851 batchurls = []
c6fd0bb8
RG
2852 if opts.batchfile is not None:
2853 try:
2a7353b8
RG
2854 if opts.batchfile == '-':
2855 batchfd = sys.stdin
2856 else:
2857 batchfd = open(opts.batchfile, 'r')
2858 batchurls = batchfd.readlines()
b65740e4 2859 batchurls = [x.strip() for x in batchurls]
817e8f52 2860 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
c6fd0bb8
RG
2861 except IOError:
2862 sys.exit(u'ERROR: batch file could not be read')
2863 all_urls = batchurls + args
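# A minimal batch file sketch for -a/--batch-file (hypothetical contents); empty
# lines and lines starting with '#', ';' or '/' are dropped by the filter above:
#
#   # personal queue
#   http://www.youtube.com/watch?v=EXAMPLE1
#   ; saved for later
#   http://www.youtube.com/watch?v=EXAMPLE2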
2864
209e9e27 2865 # Conflicting, missing and erroneous options
209e9e27 2866 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2740c509 2867 parser.error(u'using .netrc conflicts with giving username/password')
209e9e27 2868 if opts.password is not None and opts.username is None:
2740c509 2869 parser.error(u'account username missing')
1e47d226
NA
2870 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2871 parser.error(u'using output template conflicts with using title, literal title or auto number')
209e9e27 2872 if opts.usetitle and opts.useliteral:
2740c509 2873 parser.error(u'using title conflicts with using literal title')
209e9e27 2874 if opts.username is not None and opts.password is None:
76a7f364 2875 opts.password = getpass.getpass(u'Type account password and press return:')
acd3d842
RG
2876 if opts.ratelimit is not None:
2877 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2878 if numeric_limit is None:
2740c509 2879 parser.error(u'invalid rate limit specified')
acd3d842 2880 opts.ratelimit = numeric_limit
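# FileDownloader.parse_bytes (defined earlier in this file) converts the
# --rate-limit string into a byte count, presumably so that e.g. '50k' means
# 50 * 1024 bytes per second and a bare number means plain bytes; a None result
# signals an unparsable value and triggers the parser.error() above.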
7031008c
RG
2881 if opts.retries is not None:
2882 try:
2883 opts.retries = long(opts.retries)
2884 except (TypeError, ValueError), err:
2885 parser.error(u'invalid retry count specified')
8cc44341
RG
2886 try:
2887 opts.playliststart = long(opts.playliststart)
2888 if opts.playliststart <= 0:
2889 raise ValueError
2890 except (TypeError, ValueError), err:
2891 parser.error(u'invalid playlist start number specified')
2892 try:
2893 opts.playlistend = long(opts.playlistend)
2894 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2895 raise ValueError
2896 except (TypeError, ValueError), err:
2897 parser.error(u'invalid playlist end number specified')
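# Assumed semantics of the two values validated above: for instance
# --playlist-start 5 --playlist-end 20 restricts playlist, user and search
# downloads to items 5 through 20, while the -1 accepted for --playlist-end
# appears to be the "no upper bound" default.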
3072fab1
RG
2898 if opts.extractaudio:
2899 if opts.audioformat not in ['best', 'aac', 'mp3']:
2900 parser.error(u'invalid audio format specified')
4fa74b52
RG
2901
2902 # Information extractors
2903 youtube_ie = YoutubeIE()
020f7150 2904 metacafe_ie = MetacafeIE(youtube_ie)
4135fa45 2905 dailymotion_ie = DailymotionIE()
0c2dc87d 2906 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
c39c05cd 2907 youtube_user_ie = YoutubeUserIE(youtube_ie)
25af2bce 2908 youtube_search_ie = YoutubeSearchIE(youtube_ie)
49c0028a 2909 google_ie = GoogleIE()
7e58d568 2910 google_search_ie = GoogleSearchIE(google_ie)
49c0028a 2911 photobucket_ie = PhotobucketIE()
61945318 2912 yahoo_ie = YahooIE()
7e58d568 2913 yahoo_search_ie = YahooSearchIE(yahoo_ie)
27179cfd 2914 deposit_files_ie = DepositFilesIE()
9f5f9602 2915 facebook_ie = FacebookIE()
490fd7ae 2916 generic_ie = GenericIE()
4fa74b52
RG
2917
2918 # File downloader
9fcd8355 2919 fd = FileDownloader({
209e9e27
RG
2920 'usenetrc': opts.usenetrc,
2921 'username': opts.username,
2922 'password': opts.password,
9f796346 2923 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
209e9e27
RG
2924 'forceurl': opts.geturl,
2925 'forcetitle': opts.gettitle,
7e58d568
RG
2926 'forcethumbnail': opts.getthumbnail,
2927 'forcedescription': opts.getdescription,
9f796346
GI
2928 'forcefilename': opts.getfilename,
2929 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
320becd6 2930 'format': opts.format,
f2413e67 2931 'format_limit': opts.format_limit,
eae2666c 2932 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
6ba562b0
RG
2933 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2934 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2935 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
1e47d226
NA
2936 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2937 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
76a7f364
RG
2938 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2939 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
1e47d226 2940 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
76a7f364 2941 or u'%(id)s.%(ext)s'),
0086d1ec 2942 'ignoreerrors': opts.ignoreerrors,
acd3d842 2943 'ratelimit': opts.ratelimit,
0beeff4b 2944 'nooverwrites': opts.nooverwrites,
7031008c 2945 'retries': opts.retries,
7db85b2c 2946 'continuedl': opts.continue_dl,
d9835247 2947 'noprogress': opts.noprogress,
204c9398 2948 'playliststart': opts.playliststart,
8cc44341 2949 'playlistend': opts.playlistend,
331ce0a0 2950 'logtostderr': opts.outtmpl == '-',
ccbd296b 2951 'consoletitle': opts.consoletitle,
3fb2c487 2952 'nopart': opts.nopart,
e3018902 2953 'updatetime': opts.updatetime,
9fcd8355 2954 })
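# The 'outtmpl' selected above is expanded with ordinary %-formatting against a
# per-video dictionary, as the %(...)s placeholders suggest; an illustrative
# expansion (made-up values):
#
#   u'%(stitle)s-%(id)s.%(ext)s' % {u'stitle': u'Some_video', u'id': u'abc123', u'ext': u'flv'}
#   # -> u'Some_video-abc123.flv'
#
# Passing -o - writes the video data to stdout, which is why 'logtostderr' is
# switched on whenever the template is exactly '-'.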
25af2bce 2955 fd.add_info_extractor(youtube_search_ie)
0c2dc87d 2956 fd.add_info_extractor(youtube_pl_ie)
c39c05cd 2957 fd.add_info_extractor(youtube_user_ie)
020f7150 2958 fd.add_info_extractor(metacafe_ie)
4135fa45 2959 fd.add_info_extractor(dailymotion_ie)
4fa74b52 2960 fd.add_info_extractor(youtube_ie)
49c0028a 2961 fd.add_info_extractor(google_ie)
7e58d568 2962 fd.add_info_extractor(google_search_ie)
49c0028a 2963 fd.add_info_extractor(photobucket_ie)
61945318 2964 fd.add_info_extractor(yahoo_ie)
7e58d568 2965 fd.add_info_extractor(yahoo_search_ie)
27179cfd 2966 fd.add_info_extractor(deposit_files_ie)
9f5f9602 2967 fd.add_info_extractor(facebook_ie)
4bec29ef 2968
490fd7ae
RG
2969 # This must come last since it's the
2970 # fallback if none of the others work
2971 fd.add_info_extractor(generic_ie)
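# The downloader presumably walks these extractors in registration order and
# gives each URL to the first one that accepts it, which is why the more specific
# YouTube extractors (search, playlist, user) are registered before the plain
# YoutubeIE and why the catch-all GenericIE has to come last.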
2972
3072fab1
RG
2973 # PostProcessors
2974 if opts.extractaudio:
2975 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
2976
4bec29ef
RG
2977 # Update version
2978 if opts.update_self:
2979 update_self(fd, sys.argv[0])
2980
2981 # Maybe do nothing
2982 if len(all_urls) < 1:
2983 if not opts.update_self:
2984 parser.error(u'you must provide at least one URL')
2985 else:
2986 sys.exit()
c6fd0bb8 2987 retcode = fd.download(all_urls)
80066952
RG
2988
2989 # Dump cookie jar if requested
2990 if opts.cookiefile is not None:
2991 try:
2992 jar.save()
2993 except (IOError, OSError), err:
2994 sys.exit(u'ERROR: unable to save cookie jar')
2995
bb681b88 2996 sys.exit(retcode)
4fa74b52 2997
e5bf0f55
RG
2998 except DownloadError:
2999 sys.exit(1)
3000 except SameFileError:
76a7f364 3001 sys.exit(u'ERROR: fixed output name but more than one file to download')
4fa74b52 3002 except KeyboardInterrupt:
76a7f364 3003 sys.exit(u'\nERROR: Interrupted by user')
e9cb9c28
GV
3004
3005# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: