]> jfr.im git - yt-dlp.git/blob - youtube-dl
Robust error handling in downloading code
[yt-dlp.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __author__ = (
5 'Ricardo Garcia Gonzalez',
6 'Danny Colligan',
7 'Benjamin Johnson',
8 'Vasyl\' Vavrychuk',
9 'Witold Baryluk',
10 'Paweł Paprota',
11 'Gergely Imreh',
12 'Rogério Brito',
13 'Philipp Hagemeister',
14 )
15
16 __license__ = 'Public Domain'
17 __version__ = '2011.08.28-phihag'
18
19 UPDATE_URL = 'https://raw.github.com/phihag/youtube-dl/master/youtube-dl'
20
21 import cookielib
22 import datetime
23 import gzip
24 import htmlentitydefs
25 import httplib
26 import locale
27 import math
28 import netrc
29 import os
30 import os.path
31 import re
32 import socket
33 import string
34 import subprocess
35 import sys
36 import time
37 import urllib
38 import urllib2
39 import warnings
40 import zlib
41
42 if os.name == 'nt':
43 import ctypes
44
45 try:
46 import email.utils
47 except ImportError: # Python 2.4
48 import email.Utils
49 try:
50 import cStringIO as StringIO
51 except ImportError:
52 import StringIO
53
54 # parse_qs was moved from the cgi module to the urlparse module recently.
55 try:
56 from urlparse import parse_qs
57 except ImportError:
58 from cgi import parse_qs
59
60 try:
61 import lxml.etree
62 except ImportError:
63 pass # Handled below
64
# Default HTTP headers sent with every request. The User-Agent mimics a
# desktop Firefox so servers send the same content they would to a browser.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode string of characters considered safe for simplified titles.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
74
try:
    import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    import re
    class json(object):
        """Minimal stand-in for the stdlib json module (decoder only).

        Only loads() is provided; there is no dump()/dumps(). Callers that
        need an encoder probe for json.dump explicitly (see process_info).
        """
        @staticmethod
        def loads(s):
            # Recursive-descent JSON parser over the decoded text. Each
            # parse* helper takes an index i and returns (next_index, value).
            s = s.decode('UTF-8')
            def raiseError(msg, i):
                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
            def skipSpace(i, expectMore=True):
                # Advance past whitespace; optionally fail on premature end.
                while i < len(s) and s[i] in ' \t\r\n':
                    i += 1
                if expectMore:
                    if i >= len(s):
                        raiseError('Premature end', i)
                return i
            def decodeEscape(match):
                # Translate one backslash escape, including \uXXXX and
                # UTF-16 surrogate pairs (\uD8xx\uDCxx -> one code point).
                esc = match.group(1)
                _STATIC = {
                    '"': '"',
                    '\\': '\\',
                    '/': '/',
                    'b': unichr(0x8),
                    'f': unichr(0xc),
                    'n': '\n',
                    'r': '\r',
                    't': '\t',
                }
                if esc in _STATIC:
                    return _STATIC[esc]
                if esc[0] == 'u':
                    if len(esc) == 1+4:
                        return unichr(int(esc[1:5], 16))
                    if len(esc) == 5+6 and esc[5:7] == '\\u':
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                raise ValueError('Unknown escape ' + str(esc))
            def parseString(i):
                # i points at the opening quote; scan to the closing quote,
                # counting preceding backslashes to skip escaped quotes.
                i += 1
                e = i
                while True:
                    e = s.index('"', e)
                    bslashes = 0
                    while s[e-bslashes-1] == '\\':
                        bslashes += 1
                    if bslashes % 2 == 1:
                        e += 1
                        continue
                    break
                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                stri = rexp.sub(decodeEscape, s[i:e])
                return (e+1,stri)
            def parseObj(i):
                # i points at '{'.
                i += 1
                res = {}
                i = skipSpace(i)
                if s[i] == '}': # Empty dictionary
                    return (i+1,res)
                while True:
                    if s[i] != '"':
                        raiseError('Expected a string object key', i)
                    i,key = parseString(i)
                    i = skipSpace(i)
                    if i >= len(s) or s[i] != ':':
                        raiseError('Expected a colon', i)
                    i,val = parse(i+1)
                    res[key] = val
                    i = skipSpace(i)
                    if s[i] == '}':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected comma or closing curly brace', i)
                    i = skipSpace(i+1)
            def parseArray(i):
                # i points at '['.
                res = []
                i = skipSpace(i+1)
                if s[i] == ']': # Empty array
                    return (i+1,res)
                while True:
                    i,val = parse(i)
                    res.append(val)
                    i = skipSpace(i) # Raise exception if premature end
                    if s[i] == ']':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected a comma or closing bracket', i)
                    i = skipSpace(i+1)
            def parseDiscrete(i):
                # The literals true / false / null.
                for k,v in {'true': True, 'false': False, 'null': None}.items():
                    if s.startswith(k, i):
                        return (i+len(k), v)
                raiseError('Not a boolean (or null)', i)
            def parseNumber(i):
                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                if mobj is None:
                    raiseError('Not a number', i)
                nums = mobj.group(1)
                if '.' in nums or 'e' in nums or 'E' in nums:
                    return (i+len(nums), float(nums))
                return (i+len(nums), int(nums))
            CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
            def parse(i):
                # Dispatch on the first non-space character; anything not in
                # CHARMAP is assumed to start a number.
                i = skipSpace(i)
                i,res = CHARMAP.get(s[i], parseNumber)(i)
                i = skipSpace(i, False)
                return (i,res)
            i,res = parse(0)
            if i < len(s):
                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
            return res
187
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # The original wrapped this in a one-shot generator and called
    # .next() on it, which added nothing; a plain try/return suffices.
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding is actually usable.
        u'TEST'.encode(pref)
    except:
        # Misconfigured locale: fall back to UTF-8.
        pref = 'UTF-8'
    return pref
203
204 def htmlentity_transform(matchobj):
205 """Transforms an HTML entity to a Unicode character.
206
207 This function receives a match object and is intended to be used with
208 the re.sub() function.
209 """
210 entity = matchobj.group(1)
211
212 # Known non-numeric HTML entity
213 if entity in htmlentitydefs.name2codepoint:
214 return unichr(htmlentitydefs.name2codepoint[entity])
215
216 # Unicode character
217 mobj = re.match(ur'(?u)#(x?\d+)', entity)
218 if mobj is not None:
219 numstr = mobj.group(1)
220 if numstr.startswith(u'x'):
221 base = 16
222 numstr = u'0%s' % numstr
223 else:
224 base = 10
225 return unichr(long(numstr, base))
226
227 # Unknown entity in name, return its literal representation
228 return (u'&%s;' % entity)
229
230 def sanitize_title(utitle):
231 """Sanitizes a video title so it could be used as part of a filename."""
232 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
233 return utitle.replace(unicode(os.sep), u'%')
234
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            # '-' means standard output.
            if sys.platform == 'win32':
                import msvcrt
                # Put stdout into binary mode so video bytes are not mangled.
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(filename, open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
260
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Not a valid RFC 2822 date string.
        return None
    return email.utils.mktime_tz(parsed)
268
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.

    Raised from FileDownloader.trouble() when 'ignoreerrors' is not set.
    """
    pass
277
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.

    Raised from FileDownloader.download() when several URLs are given but
    the output template contains no %(...)s placeholders.
    """
    pass
285
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
293
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
301
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None   # bytes actually received
    expected = None     # bytes announced by the server (Content-Length)

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
316
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Some servers send raw deflate data without the zlib header;
        # try the raw form first, then fall back to the wrapped form.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Newer urllib2.addinfourl accepts the status code directly
        # (detected via the getcode attribute); older versions need the
        # code attribute set by hand.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force the standard headers, replacing any caller-supplied ones.
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        if 'Youtubedl-no-compression' in req.headers:
            # Marker header: drop Accept-encoding and the marker itself
            # before the request goes on the wire.
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
374
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:         Username for authentication purposes.
    password:         Password for authentication purposes.
    usenetrc:         Use netrc for authentication instead.
    quiet:            Do not print messages to stdout.
    forceurl:         Force printing final URL.
    forcetitle:       Force printing title.
    forcethumbnail:   Force printing thumbnail URL.
    forcedescription: Force printing description.
    forcefilename:    Force printing final filename.
    simulate:         Do not download the video files.
    format:           Video format code.
    format_limit:     Highest quality format to try.
    outtmpl:          Template for output names.
    ignoreerrors:     Do not stop on download errors.
    ratelimit:        Download speed limit, in bytes/sec.
    nooverwrites:     Prevent overwriting files.
    retries:          Number of times to retry for HTTP error 5xx
    continuedl:       Try to continue downloads if possible.
    noprogress:       Do not print the progress bar.
    playliststart:    Playlist item to start at.
    playlistend:      Playlist item to end at.
    logtostderr:      Log messages to stderr instead of stdout.
    consoletitle:     Display progress in console window's titlebar.
    nopart:           Do not use temporary .part files.
    updatetime:       Use the Last-modified header to set output file timestamps.
    writedescription: Write the video description to a .description file
    writeinfojson:    Write the video description to a .info.json file
    """

    # Option dictionary; see the docstring above for the supported keys.
    params = None
    # Registered InfoExtractor instances, queried in order.
    _ies = []
    # PostProcessor chain run after each successful download.
    _pps = []
    # Return code for download() (0 = success, 1 = some download failed).
    _download_retcode = None
    # Ordinal of the current download; feeds the %(autonumber)s template.
    _num_downloads = None
    # Stream used for status output (stdout, or stderr with logtostderr).
    _screen_file = None
438 def __init__(self, params):
439 """Create a FileDownloader object with the given options."""
440 self._ies = []
441 self._pps = []
442 self._download_retcode = 0
443 self._num_downloads = 0
444 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
445 self.params = params
446
447 @staticmethod
448 def pmkdir(filename):
449 """Create directory components in filename. Similar to Unix "mkdir -p"."""
450 components = filename.split(os.sep)
451 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
452 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
453 for dir in aggregate:
454 if not os.path.exists(dir):
455 os.mkdir(dir)
456
457 @staticmethod
458 def format_bytes(bytes):
459 if bytes is None:
460 return 'N/A'
461 if type(bytes) is str:
462 bytes = float(bytes)
463 if bytes == 0.0:
464 exponent = 0
465 else:
466 exponent = long(math.log(bytes, 1024.0))
467 suffix = 'bkMGTPEZY'[exponent]
468 converted = float(bytes) / float(1024**exponent)
469 return '%.2f%s' % (converted, suffix)
470
471 @staticmethod
472 def calc_percent(byte_counter, data_len):
473 if data_len is None:
474 return '---.-%'
475 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
476
477 @staticmethod
478 def calc_eta(start, now, total, current):
479 if total is None:
480 return '--:--'
481 dif = now - start
482 if current == 0 or dif < 0.001: # One millisecond
483 return '--:--'
484 rate = float(current) / dif
485 eta = long((float(total) - float(current)) / rate)
486 (eta_mins, eta_secs) = divmod(eta, 60)
487 if eta_mins > 99:
488 return '--:--'
489 return '%02d:%02d' % (eta_mins, eta_secs)
490
491 @staticmethod
492 def calc_speed(start, now, bytes):
493 dif = now - start
494 if bytes == 0 or dif < 0.001: # One millisecond
495 return '%10s' % '---b/s'
496 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
497
498 @staticmethod
499 def best_block_size(elapsed_time, bytes):
500 new_min = max(bytes / 2.0, 1.0)
501 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
502 if elapsed_time < 0.001:
503 return long(new_max)
504 rate = bytes / elapsed_time
505 if rate > new_max:
506 return long(new_max)
507 if rate < new_min:
508 return long(new_min)
509 return long(rate)
510
511 @staticmethod
512 def parse_bytes(bytestr):
513 """Parse a string indicating a byte quantity into a long integer."""
514 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
515 if matchobj is None:
516 return None
517 number = float(matchobj.group(1))
518 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
519 return long(round(number * multiplier))
520
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        # Mutual registration: the IE also gets a reference to us.
        ie.set_downloader(self)
525
    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        # Mutual registration: the PP also gets a reference to us.
        pp.set_downloader(self)
530
    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        try:
            if not self.params.get('quiet', False):
                terminator = [u'\n', u''][skip_eol]
                # The trailing comma suppresses print's own newline; the
                # chosen terminator is appended to the message instead.
                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                self._screen_file.flush()
        except (UnicodeEncodeError), err:
            if not ignore_encoding_errors:
                raise
541
    def to_stderr(self, message):
        """Print message to stderr (always, regardless of quiet mode)."""
        print >>sys.stderr, message.encode(preferredencoding())
545
    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            return
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm-style escape sequence for setting the window title.
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
556
557 def fixed_template(self):
558 """Checks if the output template is fixed."""
559 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
560
561 def trouble(self, message=None):
562 """Determine action to take when a download problem appears.
563
564 Depending on if the downloader has been configured to ignore
565 download errors or not, this method may throw an exception or
566 not when errors are found, after printing the message.
567 """
568 if message is not None:
569 self.to_stderr(message)
570 if not self.params.get('ignoreerrors', False):
571 raise DownloadError(message)
572 self._download_retcode = 1
573
574 def slow_down(self, start_time, byte_counter):
575 """Sleep if the download speed is over the rate limit."""
576 rate_limit = self.params.get('ratelimit', None)
577 if rate_limit is None or byte_counter == 0:
578 return
579 now = time.time()
580 elapsed = now - start_time
581 if elapsed <= 0.0:
582 return
583 speed = float(byte_counter) / elapsed
584 if speed > rate_limit:
585 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
586
587 def temp_name(self, filename):
588 """Returns a temporary filename for the given filename."""
589 if self.params.get('nopart', False) or filename == u'-' or \
590 (os.path.exists(filename) and not os.path.isfile(filename)):
591 return filename
592 return filename + u'.part'
593
594 def undo_temp_name(self, filename):
595 if filename.endswith(u'.part'):
596 return filename[:-len(u'.part')]
597 return filename
598
599 def try_rename(self, old_filename, new_filename):
600 try:
601 if old_filename == new_filename:
602 return
603 os.rename(old_filename, new_filename)
604 except (IOError, OSError), err:
605 self.trouble(u'ERROR: unable to rename file')
606
607 def try_utime(self, filename, last_modified_hdr):
608 """Try to set the last-modified time of the given file."""
609 if last_modified_hdr is None:
610 return
611 if not os.path.isfile(filename):
612 return
613 timestr = last_modified_hdr
614 if timestr is None:
615 return
616 filetime = timeconvert(timestr)
617 if filetime is None:
618 return
619 try:
620 os.utime(filename,(time.time(), filetime))
621 except:
622 pass
623
    def report_writedescription(self, descfn):
        """Report that the description file is being written."""
        self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
627
    def report_writeinfojson(self, infofn):
        """Report that the metadata file is being written."""
        self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
631
    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
635
    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            return
        # The leading '\r' rewrites the current line instead of appending.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
644
    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
648
    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx."""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
652
    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # Fall back to a generic message if the name cannot be encoded.
            self.to_screen(u'[download] The file has already been downloaded')
659
    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')
663
    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        else:
            # The progress line used '\r'; print the newline it withheld.
            self.to_screen(u'')
670
    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1
674
675 def prepare_filename(self, info_dict):
676 """Generate the output filename."""
677 try:
678 template_dict = dict(info_dict)
679 template_dict['epoch'] = unicode(long(time.time()))
680 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
681 filename = self.params['outtmpl'] % template_dict
682 return filename
683 except (ValueError, KeyError), err:
684 self.trouble(u'ERROR: invalid system charset or erroneous output template')
685 return None
686
    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        filename = self.prepare_filename(info_dict)
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Forced printings
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcefilename', False) and filename is not None:
                print filename.encode(preferredencoding(), 'xmlcharrefreplace')

            return

        if filename is None:
            # prepare_filename already reported the template problem.
            return
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')
            return

        try:
            self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directories: %s' % str(err))
            return

        if self.params.get('writedescription', False):
            try:
                descfn = filename + '.description'
                self.report_writedescription(descfn)
                descfile = open(descfn, 'wb')
                try:
                    descfile.write(info_dict['description'].encode('utf-8'))
                finally:
                    descfile.close()
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
                return

        if self.params.get('writeinfojson', False):
            infofn = filename + '.info.json'
            self.report_writeinfojson(infofn)
            try:
                # Probe for an encoder: the bundled trivialjson fallback
                # only implements loads(), so json.dump may be missing.
                json.dump
            except (NameError,AttributeError):
                self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
                return
            try:
                infof = open(infofn, 'wb')
                try:
                    json.dump(info_dict, infof)
                finally:
                    infof.close()
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
                return

        try:
            success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:
            # Local filesystem errors are treated as "video unavailable".
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
            return
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
            return

        if success:
            try:
                self.post_process(filename, info_dict)
            except (PostProcessingError), err:
                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                return
766
767 def download(self, url_list):
768 """Download a given list of URLs."""
769 if len(url_list) > 1 and self.fixed_template():
770 raise SameFileError(self.params['outtmpl'])
771
772 for url in url_list:
773 suitable_found = False
774 for ie in self._ies:
775 # Go to next InfoExtractor if not suitable
776 if not ie.suitable(url):
777 continue
778
779 # Suitable InfoExtractor found
780 suitable_found = True
781
782 # Extract information from URL and process it
783 ie.extract(url)
784
785 # Suitable InfoExtractor had been found; go to next URL
786 break
787
788 if not suitable_found:
789 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
790
791 return self._download_retcode
792
793 def post_process(self, filename, ie_info):
794 """Run the postprocessing chain on the given file."""
795 info = dict(ie_info)
796 info['filepath'] = filename
797 for pp in self._pps:
798 info = pp.run(info)
799 if info is None:
800 break
801
    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an rtmp:// URL by driving the external rtmpdump tool."""
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        # Note: [[], opts][flag] appends opts only when flag is true.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            # Resume (-e); '-k 1' is added after a plain failure (retval 1).
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            if prevsize == cursize and retval == 1:
                # No progress since the last attempt: stop retrying.
                break
        if retval == 0:
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
            self.try_rename(tmpfilename, filename)
            return True
        else:
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            return False
833
834 def _do_download(self, filename, url, player_url):
835 # Check file already present
836 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
837 self.report_file_already_downloaded(filename)
838 return True
839
840 # Attempt to download using rtmpdump
841 if url.startswith('rtmp'):
842 return self._download_with_rtmpdump(filename, url, player_url)
843
844 tmpfilename = self.temp_name(filename)
845 stream = None
846 open_mode = 'wb'
847
848 # Do not include the Accept-Encoding header
849 headers = {'Youtubedl-no-compression': 'True'}
850 basic_request = urllib2.Request(url, None, headers)
851 request = urllib2.Request(url, None, headers)
852
853 # Establish possible resume length
854 if os.path.isfile(tmpfilename):
855 resume_len = os.path.getsize(tmpfilename)
856 else:
857 resume_len = 0
858
859 # Request parameters in case of being able to resume
860 if self.params.get('continuedl', False) and resume_len != 0:
861 self.report_resuming_byte(resume_len)
862 request.add_header('Range','bytes=%d-' % resume_len)
863 open_mode = 'ab'
864
865 count = 0
866 retries = self.params.get('retries', 0)
867 while count <= retries:
868 # Establish connection
869 try:
870 data = urllib2.urlopen(request)
871 break
872 except (urllib2.HTTPError, ), err:
873 if (err.code < 500 or err.code >= 600) and err.code != 416:
874 # Unexpected HTTP error
875 raise
876 elif err.code == 416:
877 # Unable to resume (requested range not satisfiable)
878 try:
879 # Open the connection again without the range header
880 data = urllib2.urlopen(basic_request)
881 content_length = data.info()['Content-Length']
882 except (urllib2.HTTPError, ), err:
883 if err.code < 500 or err.code >= 600:
884 raise
885 else:
886 # Examine the reported length
887 if (content_length is not None and
888 (resume_len - 100 < long(content_length) < resume_len + 100)):
889 # The file had already been fully downloaded.
890 # Explanation to the above condition: in issue #175 it was revealed that
891 # YouTube sometimes adds or removes a few bytes from the end of the file,
892 # changing the file size slightly and causing problems for some users. So
893 # I decided to implement a suggested change and consider the file
894 # completely downloaded if the file size differs less than 100 bytes from
895 # the one in the hard drive.
896 self.report_file_already_downloaded(filename)
897 self.try_rename(tmpfilename, filename)
898 return True
899 else:
900 # The length does not match, we start the download over
901 self.report_unable_to_resume()
902 open_mode = 'wb'
903 break
904 # Retry
905 count += 1
906 if count <= retries:
907 self.report_retry(count, retries)
908
909 if count > retries:
910 self.trouble(u'ERROR: giving up after %s retries' % retries)
911 return False
912
913 data_len = data.info().get('Content-length', None)
914 if data_len is not None:
915 data_len = long(data_len) + resume_len
916 data_len_str = self.format_bytes(data_len)
917 byte_counter = 0 + resume_len
918 block_size = 1024
919 start = time.time()
920 while True:
921 # Download and write
922 before = time.time()
923 data_block = data.read(block_size)
924 after = time.time()
925 if len(data_block) == 0:
926 break
927 byte_counter += len(data_block)
928
929 # Open file just in time
930 if stream is None:
931 try:
932 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
933 assert stream is not None
934 filename = self.undo_temp_name(tmpfilename)
935 self.report_destination(filename)
936 except (OSError, IOError), err:
937 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
938 return False
939 try:
940 stream.write(data_block)
941 except (IOError, OSError), err:
942 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
943 return False
944 block_size = self.best_block_size(after - before, len(data_block))
945
946 # Progress message
947 percent_str = self.calc_percent(byte_counter, data_len)
948 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
949 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
950 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
951
952 # Apply rate limit
953 self.slow_down(start, byte_counter - resume_len)
954
955 if stream is None:
956 self.trouble(u'\nERROR: Did not get any data blocks')
957 return False
958 stream.close()
959 self.report_finish()
960 if data_len is not None and byte_counter != data_len:
961 raise ContentTooShortError(byte_counter, long(data_len))
962 self.try_rename(tmpfilename, filename)
963
964 # Update file modification time
965 if self.params.get('updatetime', True):
966 self.try_utime(filename, data.info().get('last-modified', None))
967
968 return True
969
class InfoExtractor(object):
    """Base class for all information extractors (IEs).

    An information extractor turns a URL into one or more dictionaries of
    video metadata, which it hands to the FileDownloader for processing
    (possibly downloading the video to the file system, among other
    outcomes). Each dictionary must carry the following keys:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    Two optional keys, 'thumbnail' and 'description', exist mainly so
    youtube-dl can back a video search service; they are only used when
    their respective forced-printing functions are called.

    Subclasses should override _real_initialize(), _real_extract() and the
    suitable() static method, and are typically instantiated and added to
    the main downloader.
    """

    # Flipped to True once _real_initialize() has run.
    _ready = False
    # The FileDownloader this IE reports to (may stay None).
    _downloader = None

    def __init__(self, downloader=None):
        """Create the extractor and attach the optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Return True if this IE can handle the given URL."""
        return False

    def set_downloader(self, downloader):
        """Attach the FileDownloader used for output and processing."""
        self._downloader = downloader

    def initialize(self):
        """Run one-time setup (authentication, etc.) exactly once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then extract and return the URL's info."""
        self.initialize()
        return self._real_extract(url)

    def _real_initialize(self):
        """Actual setup work. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Actual extraction work. Redefine in subclasses."""
        pass
1040
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Handles language selection, optional account login (from explicit
    credentials or .netrc), age-gate confirmation, and extraction of the
    real video URL plus metadata for one or more requested formats.
    """

    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
    # Maps itag -> filename extension; anything unlisted falls back to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '45': 'webm',
    }

    @staticmethod
    def suitable(url):
        """Return True if the URL matches a known YouTube URL shape."""
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age.

        All failures before the age check are warnings only: extraction
        can still proceed anonymously.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # Bad/missing .netrc aborts initialization with a warning.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language (forces English pages so later regexes match)
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract video URL(s) and metadata and pass them to the downloader."""
        # Extract video id from URL (group 2 of _VALID_URL is the id)
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        # NOTE(review): the '&amp;' before has_verified looks HTML-escaped;
        # presumably a plain '&' was intended — confirm before changing.
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL; the page embeds it with
        # backslash-escaped slashes, which re.sub below unescapes.
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' page variants until one returns
        # a 'token' parameter (different variants work for different videos).
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title: collapse runs of non-alphanumerics to '_'
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then try known date formats.
        # NOTE: the bare except silently skips formats that fail to parse;
        # on success upload_date becomes 'YYYYMMDD'.
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description: use lxml if the optional import at the top of the
        # file succeeded, otherwise fall back to a meta-tag regex (and only
        # bother when the description is actually requested).
        try:
            lxml.etree
        except NameError:
            video_description = u'No description available.'
            if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
                mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
                if mobj is not None:
                    video_description = mobj.group(1).decode('utf-8')
        else:
            html_parser = lxml.etree.HTMLParser(encoding='utf-8')
            vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
            video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
            # TODO use another parser

        # token (extracted but not used further in this method)
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build itag -> URL map from the comma-separated stream map.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            # Honor -f/--max-quality by truncating the quality-ordered list.
            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description,
                    'player_url': player_url,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
1329
1330
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    Initialization retrieves the family-filter disclaimer and confirms
    age; extraction delegates 'yt-' prefixed ids to the YouTube IE.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    # YoutubeIE instance used for embedded YouTube videos.
    _youtube_ie = None

    def __init__(self, youtube_ie, downloader=None):
        """Store the YouTube IE used to handle 'yt-' delegated videos."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        """Return True for metacafe.com /watch/ URLs."""
        return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the age-confirmation form."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL and metadata from a Metacafe watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate and stop.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
            return

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage. Two page layouts
        # exist: a direct &mediaURL= parameter, or JSON-ish 'mediaData'
        # inside the flashvars input.
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = urllib.unquote(mobj.group(1))
            # Last three characters of the URL are taken as the extension.
            video_extension = mediaURL[-3:]

            # Extract gdaKey (access token appended as __gda__) if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            # Unescape the JSON-style '\/' sequences in the URL.
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1474
1475
1476 class DailymotionIE(InfoExtractor):
1477 """Information Extractor for Dailymotion"""
1478
1479 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1480
1481 def __init__(self, downloader=None):
1482 InfoExtractor.__init__(self, downloader)
1483
1484 @staticmethod
1485 def suitable(url):
1486 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1487
1488 def report_download_webpage(self, video_id):
1489 """Report webpage download."""
1490 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1491
1492 def report_extraction(self, video_id):
1493 """Report information extraction."""
1494 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1495
1496 def _real_initialize(self):
1497 return
1498
1499 def _real_extract(self, url):
1500 # Extract id and simplified title from URL
1501 mobj = re.match(self._VALID_URL, url)
1502 if mobj is None:
1503 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1504 return
1505
1506 # At this point we have a new video
1507 self._downloader.increment_downloads()
1508 video_id = mobj.group(1)
1509
1510 simple_title = mobj.group(2).decode('utf-8')
1511 video_extension = 'flv'
1512
1513 # Retrieve video webpage to extract further information
1514 request = urllib2.Request(url)
1515 try:
1516 self.report_download_webpage(video_id)
1517 webpage = urllib2.urlopen(request).read()
1518 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1519 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1520 return
1521
1522 # Extract URL, uploader and title from webpage
1523 self.report_extraction(video_id)
1524 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1525 if mobj is None:
1526 self._downloader.trouble(u'ERROR: unable to extract media URL')
1527 return
1528 mediaURL = urllib.unquote(mobj.group(1))
1529
1530 # if needed add http://www.dailymotion.com/ if relative URL
1531
1532 video_url = mediaURL
1533
1534 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1535 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1536 if mobj is None:
1537 self._downloader.trouble(u'ERROR: unable to extract title')
1538 return
1539 video_title = mobj.group(1).decode('utf-8')
1540 video_title = sanitize_title(video_title)
1541
1542 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1543 if mobj is None:
1544 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1545 return
1546 video_uploader = mobj.group(1)
1547
1548 try:
1549 # Process video information
1550 self._downloader.process_info({
1551 'id': video_id.decode('utf-8'),
1552 'url': video_url.decode('utf-8'),
1553 'uploader': video_uploader.decode('utf-8'),
1554 'upload_date': u'NA',
1555 'title': video_title,
1556 'stitle': simple_title,
1557 'ext': video_extension.decode('utf-8'),
1558 'format': u'NA',
1559 'player_url': None,
1560 })
1561 except UnavailableVideoError:
1562 self._downloader.trouble(u'\nERROR: unable to download video')
1563
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        """Return True for Google Video /videoplay?docid= URLs."""
        return (re.match(GoogleIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No setup required for Google Video.
        return

    def _real_extract(self, url):
        """Scrape the play page for the media URL, title and description."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage.
        # Prefer the direct MP4 download link; if absent, fall back to the
        # FLV stream URL, which is embedded with \xNN escape sequences.
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))
        # Undo the literal '\x3d' / '\x26' escapes ('=' and '&').
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail; requires a second request, so only done
        # when explicitly requested via 'forcethumbnail'.
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info
            video_thumbnail = ''


        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': u'NA',
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1673
1674
1675 class PhotobucketIE(InfoExtractor):
1676 """Information extractor for photobucket.com."""
1677
1678 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1679
1680 def __init__(self, downloader=None):
1681 InfoExtractor.__init__(self, downloader)
1682
1683 @staticmethod
1684 def suitable(url):
1685 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1686
1687 def report_download_webpage(self, video_id):
1688 """Report webpage download."""
1689 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1690
1691 def report_extraction(self, video_id):
1692 """Report information extraction."""
1693 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1694
1695 def _real_initialize(self):
1696 return
1697
1698 def _real_extract(self, url):
1699 # Extract id from URL
1700 mobj = re.match(self._VALID_URL, url)
1701 if mobj is None:
1702 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1703 return
1704
1705 # At this point we have a new video
1706 self._downloader.increment_downloads()
1707 video_id = mobj.group(1)
1708
1709 video_extension = 'flv'
1710
1711 # Retrieve video webpage to extract further information
1712 request = urllib2.Request(url)
1713 try:
1714 self.report_download_webpage(video_id)
1715 webpage = urllib2.urlopen(request).read()
1716 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1717 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1718 return
1719
1720 # Extract URL, uploader, and title from webpage
1721 self.report_extraction(video_id)
1722 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1723 if mobj is None:
1724 self._downloader.trouble(u'ERROR: unable to extract media URL')
1725 return
1726 mediaURL = urllib.unquote(mobj.group(1))
1727
1728 video_url = mediaURL
1729
1730 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1731 if mobj is None:
1732 self._downloader.trouble(u'ERROR: unable to extract title')
1733 return
1734 video_title = mobj.group(1).decode('utf-8')
1735 video_title = sanitize_title(video_title)
1736 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1737
1738 video_uploader = mobj.group(2).decode('utf-8')
1739
1740 try:
1741 # Process video information
1742 self._downloader.process_info({
1743 'id': video_id.decode('utf-8'),
1744 'url': video_url.decode('utf-8'),
1745 'uploader': video_uploader,
1746 'upload_date': u'NA',
1747 'title': video_title,
1748 'stitle': simple_title,
1749 'ext': video_extension.decode('utf-8'),
1750 'format': u'NA',
1751 'player_url': None,
1752 })
1753 except UnavailableVideoError:
1754 self._downloader.trouble(u'\nERROR: unable to download video')
1755
1756
1757 class YahooIE(InfoExtractor):
1758 """Information extractor for video.yahoo.com."""
1759
1760 # _VALID_URL matches all Yahoo! Video URLs
1761 # _VPAGE_URL matches only the extractable '/watch/' URLs
1762 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1763 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1764
1765 def __init__(self, downloader=None):
1766 InfoExtractor.__init__(self, downloader)
1767
1768 @staticmethod
1769 def suitable(url):
1770 return (re.match(YahooIE._VALID_URL, url) is not None)
1771
1772 def report_download_webpage(self, video_id):
1773 """Report webpage download."""
1774 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1775
1776 def report_extraction(self, video_id):
1777 """Report information extraction."""
1778 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1779
1780 def _real_initialize(self):
1781 return
1782
1783 def _real_extract(self, url, new_video=True):
1784 # Extract ID from URL
1785 mobj = re.match(self._VALID_URL, url)
1786 if mobj is None:
1787 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1788 return
1789
1790 # At this point we have a new video
1791 self._downloader.increment_downloads()
1792 video_id = mobj.group(2)
1793 video_extension = 'flv'
1794
1795 # Rewrite valid but non-extractable URLs as
1796 # extractable English language /watch/ URLs
1797 if re.match(self._VPAGE_URL, url) is None:
1798 request = urllib2.Request(url)
1799 try:
1800 webpage = urllib2.urlopen(request).read()
1801 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1802 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1803 return
1804
1805 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1806 if mobj is None:
1807 self._downloader.trouble(u'ERROR: Unable to extract id field')
1808 return
1809 yahoo_id = mobj.group(1)
1810
1811 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1812 if mobj is None:
1813 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1814 return
1815 yahoo_vid = mobj.group(1)
1816
1817 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1818 return self._real_extract(url, new_video=False)
1819
1820 # Retrieve video webpage to extract further information
1821 request = urllib2.Request(url)
1822 try:
1823 self.report_download_webpage(video_id)
1824 webpage = urllib2.urlopen(request).read()
1825 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1826 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1827 return
1828
1829 # Extract uploader and title from webpage
1830 self.report_extraction(video_id)
1831 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1832 if mobj is None:
1833 self._downloader.trouble(u'ERROR: unable to extract video title')
1834 return
1835 video_title = mobj.group(1).decode('utf-8')
1836 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1837
1838 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1839 if mobj is None:
1840 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1841 return
1842 video_uploader = mobj.group(1).decode('utf-8')
1843
1844 # Extract video thumbnail
1845 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1846 if mobj is None:
1847 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1848 return
1849 video_thumbnail = mobj.group(1).decode('utf-8')
1850
1851 # Extract video description
1852 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1853 if mobj is None:
1854 self._downloader.trouble(u'ERROR: unable to extract video description')
1855 return
1856 video_description = mobj.group(1).decode('utf-8')
1857 if not video_description: video_description = 'No description available.'
1858
1859 # Extract video height and width
1860 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1861 if mobj is None:
1862 self._downloader.trouble(u'ERROR: unable to extract video height')
1863 return
1864 yv_video_height = mobj.group(1)
1865
1866 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1867 if mobj is None:
1868 self._downloader.trouble(u'ERROR: unable to extract video width')
1869 return
1870 yv_video_width = mobj.group(1)
1871
1872 # Retrieve video playlist to extract media URL
1873 # I'm not completely sure what all these options are, but we
1874 # seem to need most of them, otherwise the server sends a 401.
1875 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1876 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1877 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1878 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1879 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1880 try:
1881 self.report_download_webpage(video_id)
1882 webpage = urllib2.urlopen(request).read()
1883 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1884 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1885 return
1886
1887 # Extract media URL from playlist XML
1888 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1889 if mobj is None:
1890 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1891 return
1892 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1893 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1894
1895 try:
1896 # Process video information
1897 self._downloader.process_info({
1898 'id': video_id.decode('utf-8'),
1899 'url': video_url,
1900 'uploader': video_uploader,
1901 'upload_date': u'NA',
1902 'title': video_title,
1903 'stitle': simple_title,
1904 'ext': video_extension.decode('utf-8'),
1905 'thumbnail': video_thumbnail.decode('utf-8'),
1906 'description': video_description,
1907 'thumbnail': video_thumbnail,
1908 'description': video_description,
1909 'player_url': None,
1910 })
1911 except UnavailableVideoError:
1912 self._downloader.trouble(u'\nERROR: unable to download video')
1913
1914
1915 class VimeoIE(InfoExtractor):
1916 """Information extractor for vimeo.com."""
1917
1918 # _VALID_URL matches Vimeo URLs
1919 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1920
1921 def __init__(self, downloader=None):
1922 InfoExtractor.__init__(self, downloader)
1923
1924 @staticmethod
1925 def suitable(url):
1926 return (re.match(VimeoIE._VALID_URL, url) is not None)
1927
1928 def report_download_webpage(self, video_id):
1929 """Report webpage download."""
1930 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1931
1932 def report_extraction(self, video_id):
1933 """Report information extraction."""
1934 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1935
1936 def _real_initialize(self):
1937 return
1938
1939 def _real_extract(self, url, new_video=True):
1940 # Extract ID from URL
1941 mobj = re.match(self._VALID_URL, url)
1942 if mobj is None:
1943 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1944 return
1945
1946 # At this point we have a new video
1947 self._downloader.increment_downloads()
1948 video_id = mobj.group(1)
1949
1950 # Retrieve video webpage to extract further information
1951 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1952 try:
1953 self.report_download_webpage(video_id)
1954 webpage = urllib2.urlopen(request).read()
1955 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1956 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1957 return
1958
1959 # Now we begin extracting as much information as we can from what we
1960 # retrieved. First we extract the information common to all extractors,
1961 # and latter we extract those that are Vimeo specific.
1962 self.report_extraction(video_id)
1963
1964 # Extract title
1965 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1966 if mobj is None:
1967 self._downloader.trouble(u'ERROR: unable to extract video title')
1968 return
1969 video_title = mobj.group(1).decode('utf-8')
1970 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1971
1972 # Extract uploader
1973 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1974 if mobj is None:
1975 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1976 return
1977 video_uploader = mobj.group(1).decode('utf-8')
1978
1979 # Extract video thumbnail
1980 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1981 if mobj is None:
1982 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1983 return
1984 video_thumbnail = mobj.group(1).decode('utf-8')
1985
1986 # # Extract video description
1987 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
1988 # if mobj is None:
1989 # self._downloader.trouble(u'ERROR: unable to extract video description')
1990 # return
1991 # video_description = mobj.group(1).decode('utf-8')
1992 # if not video_description: video_description = 'No description available.'
1993 video_description = 'Foo.'
1994
1995 # Vimeo specific: extract request signature
1996 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
1997 if mobj is None:
1998 self._downloader.trouble(u'ERROR: unable to extract request signature')
1999 return
2000 sig = mobj.group(1).decode('utf-8')
2001
2002 # Vimeo specific: Extract request signature expiration
2003 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2004 if mobj is None:
2005 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2006 return
2007 sig_exp = mobj.group(1).decode('utf-8')
2008
2009 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2010
2011 try:
2012 # Process video information
2013 self._downloader.process_info({
2014 'id': video_id.decode('utf-8'),
2015 'url': video_url,
2016 'uploader': video_uploader,
2017 'upload_date': u'NA',
2018 'title': video_title,
2019 'stitle': simple_title,
2020 'ext': u'mp4',
2021 'thumbnail': video_thumbnail.decode('utf-8'),
2022 'description': video_description,
2023 'thumbnail': video_thumbnail,
2024 'description': video_description,
2025 'player_url': None,
2026 })
2027 except UnavailableVideoError:
2028 self._downloader.trouble(u'ERROR: unable to download video')
2029
2030
2031 class GenericIE(InfoExtractor):
2032 """Generic last-resort information extractor."""
2033
2034 def __init__(self, downloader=None):
2035 InfoExtractor.__init__(self, downloader)
2036
2037 @staticmethod
2038 def suitable(url):
2039 return True
2040
2041 def report_download_webpage(self, video_id):
2042 """Report webpage download."""
2043 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2044 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2045
2046 def report_extraction(self, video_id):
2047 """Report information extraction."""
2048 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2049
2050 def _real_initialize(self):
2051 return
2052
2053 def _real_extract(self, url):
2054 # At this point we have a new video
2055 self._downloader.increment_downloads()
2056
2057 video_id = url.split('/')[-1]
2058 request = urllib2.Request(url)
2059 try:
2060 self.report_download_webpage(video_id)
2061 webpage = urllib2.urlopen(request).read()
2062 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2063 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2064 return
2065 except ValueError, err:
2066 # since this is the last-resort InfoExtractor, if
2067 # this error is thrown, it'll be thrown here
2068 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2069 return
2070
2071 self.report_extraction(video_id)
2072 # Start with something easy: JW Player in SWFObject
2073 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2074 if mobj is None:
2075 # Broaden the search a little bit
2076 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2077 if mobj is None:
2078 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2079 return
2080
2081 # It's possible that one of the regexes
2082 # matched, but returned an empty group:
2083 if mobj.group(1) is None:
2084 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2085 return
2086
2087 video_url = urllib.unquote(mobj.group(1))
2088 video_id = os.path.basename(video_url)
2089
2090 # here's a fun little line of code for you:
2091 video_extension = os.path.splitext(video_id)[1][1:]
2092 video_id = os.path.splitext(video_id)[0]
2093
2094 # it's tempting to parse this further, but you would
2095 # have to take into account all the variations like
2096 # Video Title - Site Name
2097 # Site Name | Video Title
2098 # Video Title - Tagline | Site Name
2099 # and so on and so forth; it's just not practical
2100 mobj = re.search(r'<title>(.*)</title>', webpage)
2101 if mobj is None:
2102 self._downloader.trouble(u'ERROR: unable to extract title')
2103 return
2104 video_title = mobj.group(1).decode('utf-8')
2105 video_title = sanitize_title(video_title)
2106 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2107
2108 # video uploader is domain name
2109 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2110 if mobj is None:
2111 self._downloader.trouble(u'ERROR: unable to extract title')
2112 return
2113 video_uploader = mobj.group(1).decode('utf-8')
2114
2115 try:
2116 # Process video information
2117 self._downloader.process_info({
2118 'id': video_id.decode('utf-8'),
2119 'url': video_url.decode('utf-8'),
2120 'uploader': video_uploader,
2121 'upload_date': u'NA',
2122 'title': video_title,
2123 'stitle': simple_title,
2124 'ext': video_extension.decode('utf-8'),
2125 'format': u'NA',
2126 'player_url': None,
2127 })
2128 except UnavailableVideoError, err:
2129 self._downloader.trouble(u'\nERROR: unable to download video')
2130
2131
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Accepts 'ytsearch:<query>' (first result only), 'ytsearchN:<query>'
    (first N results) and 'ytsearchall:<query>' (up to
    _max_youtube_results). Actual video extraction is delegated to the
    wrapped YoutubeIE instance.
    """
    _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'  # recognized query syntax
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'  # marks a video link in a result page
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'  # present while more result pages exist
    _youtube_ie = None  # YoutubeIE used for the per-video extraction
    _max_youtube_results = 1000  # hard cap on results fetched per query

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        """Return True if the string looks like a ytsearch query."""
        return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        """Parse the 'ytsearch...' prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_QUERY, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'ytsearch' (8 characters)
        query = query.encode('utf-8')
        if prefix == '':
            # bare 'ytsearch:' downloads only the first result
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = long(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    # clamp oversized requests rather than failing
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()  # guards against duplicate ids across result pages
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                # the match looks like href="/watch?v=ID"; take the text after
                # the second '=' and drop the trailing quote
                video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # no 'Next' link: fewer than n results exist; extract what we have
                for id in video_ids:
                    self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                return

            pagenum = pagenum + 1
2222
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Accepts 'gvsearch:<query>' (first result only), 'gvsearchN:<query>'
    (first N results) and 'gvsearchall:<query>' (up to
    _max_google_results). Per-video extraction is delegated to the
    wrapped GoogleIE instance.
    """
    _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'  # recognized query syntax
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'  # captures the docid of a result
    _MORE_PAGES_INDICATOR = r'<span>Next</span>'  # present while more result pages exist
    _google_ie = None  # GoogleIE used for the per-video extraction
    _max_google_results = 1000  # hard cap on results fetched per query

    def __init__(self, google_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._google_ie = google_ie

    @staticmethod
    def suitable(url):
        """Return True if the string looks like a gvsearch query."""
        return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._google_ie.initialize()

    def _real_extract(self, query):
        """Parse the 'gvsearch...' prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_QUERY, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'gvsearch' (8 characters)
        query = query.encode('utf-8')
        if prefix == '':
            # bare 'gvsearch:' downloads only the first result
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = long(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    # clamp oversized requests rather than failing
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()  # guards against duplicate ids across result pages
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            # NOTE(review): the template's 'start' parameter is filled with the
            # page number, not a result offset -- confirm against the site.
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # no 'Next' link: fewer than n results exist; extract what we have
                for id in video_ids:
                    self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
                return

            pagenum = pagenum + 1
2313
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Accepts 'yvsearch:<query>' (first result only), 'yvsearchN:<query>'
    (first N results) and 'yvsearchall:<query>' (up to
    _max_yahoo_results). Per-video extraction is delegated to the
    wrapped YahooIE instance.
    """
    _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'  # recognized query syntax
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'  # captures 'vid/id'
    _MORE_PAGES_INDICATOR = r'\s*Next'  # present while more result pages exist
    _yahoo_ie = None  # YahooIE used for the per-video extraction
    _max_yahoo_results = 1000  # hard cap on results fetched per query

    def __init__(self, yahoo_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._yahoo_ie = yahoo_ie

    @staticmethod
    def suitable(url):
        """Return True if the string looks like a yvsearch query."""
        return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._yahoo_ie.initialize()

    def _real_extract(self, query):
        """Parse the 'yvsearch...' prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_QUERY, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'yvsearch' (8 characters)
        query = query.encode('utf-8')
        if prefix == '':
            # bare 'yvsearch:' downloads only the first result
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = long(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    # clamp oversized requests rather than failing
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()  # guards against duplicate ids across result pages
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            # NOTE(review): the template's 'o' parameter is filled with the
            # page number -- confirm this is what the site expects.
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # no 'Next' link: fewer than n results exist; extract what we have
                for id in video_ids:
                    self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
                return

            pagenum = pagenum + 1
2404
2405 class YoutubePlaylistIE(InfoExtractor):
2406 """Information Extractor for YouTube playlists."""
2407
2408 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2409 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2410 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2411 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2412 _youtube_ie = None
2413
2414 def __init__(self, youtube_ie, downloader=None):
2415 InfoExtractor.__init__(self, downloader)
2416 self._youtube_ie = youtube_ie
2417
2418 @staticmethod
2419 def suitable(url):
2420 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2421
2422 def report_download_page(self, playlist_id, pagenum):
2423 """Report attempt to download playlist page with given number."""
2424 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2425
2426 def _real_initialize(self):
2427 self._youtube_ie.initialize()
2428
2429 def _real_extract(self, url):
2430 # Extract playlist id
2431 mobj = re.match(self._VALID_URL, url)
2432 if mobj is None:
2433 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2434 return
2435
2436 # Single video case
2437 if mobj.group(3) is not None:
2438 self._youtube_ie.extract(mobj.group(3))
2439 return
2440
2441 # Download playlist pages
2442 # prefix is 'p' as default for playlists but there are other types that need extra care
2443 playlist_prefix = mobj.group(1)
2444 if playlist_prefix == 'a':
2445 playlist_access = 'artist'
2446 else:
2447 playlist_prefix = 'p'
2448 playlist_access = 'view_play_list'
2449 playlist_id = mobj.group(2)
2450 video_ids = []
2451 pagenum = 1
2452
2453 while True:
2454 self.report_download_page(playlist_id, pagenum)
2455 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2456 try:
2457 page = urllib2.urlopen(request).read()
2458 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2459 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2460 return
2461
2462 # Extract video identifiers
2463 ids_in_page = []
2464 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2465 if mobj.group(1) not in ids_in_page:
2466 ids_in_page.append(mobj.group(1))
2467 video_ids.extend(ids_in_page)
2468
2469 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2470 break
2471 pagenum = pagenum + 1
2472
2473 playliststart = self._downloader.params.get('playliststart', 1) - 1
2474 playlistend = self._downloader.params.get('playlistend', -1)
2475 video_ids = video_ids[playliststart:playlistend]
2476
2477 for id in video_ids:
2478 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2479 return
2480
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Collects the video ids of a user's uploads via the GDATA feed
    (paged, _GDATA_PAGE_SIZE ids per request) and delegates each video
    to the wrapped YoutubeIE instance.
    """

    _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50  # GDATA caps max-results per request at 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'  # captures a video id in the feed
    _youtube_ie = None  # YoutubeIE used for the per-video extraction

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        """Return True if the URL (or 'ytuser:' shorthand) names a user."""
        return (re.match(YoutubeUserIE._VALID_URL, url) is not None)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            # GDATA start-index is 1-based
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # playliststart is 1-based in params; convert to 0-based index.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        # playlistend == -1 means "no upper bound"
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2570
2571
2572 class DepositFilesIE(InfoExtractor):
2573 """Information extractor for depositfiles.com"""
2574
2575 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2576
2577 def __init__(self, downloader=None):
2578 InfoExtractor.__init__(self, downloader)
2579
2580 @staticmethod
2581 def suitable(url):
2582 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2583
2584 def report_download_webpage(self, file_id):
2585 """Report webpage download."""
2586 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2587
2588 def report_extraction(self, file_id):
2589 """Report information extraction."""
2590 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2591
2592 def _real_initialize(self):
2593 return
2594
2595 def _real_extract(self, url):
2596 # At this point we have a new file
2597 self._downloader.increment_downloads()
2598
2599 file_id = url.split('/')[-1]
2600 # Rebuild url in english locale
2601 url = 'http://depositfiles.com/en/files/' + file_id
2602
2603 # Retrieve file webpage with 'Free download' button pressed
2604 free_download_indication = { 'gateway_result' : '1' }
2605 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2606 try:
2607 self.report_download_webpage(file_id)
2608 webpage = urllib2.urlopen(request).read()
2609 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2610 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2611 return
2612
2613 # Search for the real file URL
2614 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2615 if (mobj is None) or (mobj.group(1) is None):
2616 # Try to figure out reason of the error.
2617 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2618 if (mobj is not None) and (mobj.group(1) is not None):
2619 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2620 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2621 else:
2622 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2623 return
2624
2625 file_url = mobj.group(1)
2626 file_extension = os.path.splitext(file_url)[1][1:]
2627
2628 # Search for file title
2629 mobj = re.search(r'<b title="(.*?)">', webpage)
2630 if mobj is None:
2631 self._downloader.trouble(u'ERROR: unable to extract title')
2632 return
2633 file_title = mobj.group(1).decode('utf-8')
2634
2635 try:
2636 # Process file information
2637 self._downloader.process_info({
2638 'id': file_id.decode('utf-8'),
2639 'url': file_url.decode('utf-8'),
2640 'uploader': u'NA',
2641 'upload_date': u'NA',
2642 'title': file_title,
2643 'stitle': file_title,
2644 'ext': file_extension.decode('utf-8'),
2645 'format': u'NA',
2646 'player_url': None,
2647 })
2648 except UnavailableVideoError, err:
2649 self._downloader.trouble(u'ERROR: unable to download file')
2650
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook video pages.

    Optionally logs in first (via --username/--password or ~/.netrc) so
    that videos restricted to logged-in users can be fetched.
    """

    # The numeric video id is captured in the named group "ID".
    # NOTE(review): the dots in "facebook.com" / "video.php" are unescaped,
    # so they match any character — harmless in practice but imprecise.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    # Mobile login endpoint; credentials are POSTed here by _real_initialize().
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name looked up in ~/.netrc when --netrc is given.
    _NETRC_MACHINE = 'facebook'
    # Known format codes, best quality first.
    _available_formats = ['highqual', 'lowqual']
    # Filename extension used for each format code.
    _video_extensions = {
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        """Return True if this extractor can handle the given URL."""
        return (re.match(FacebookIE._VALID_URL, url) is not None)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page.

        Returns a dict with any of the keys 'title', 'description', 'owner',
        'upload_date', 'thumbnail' that could be scraped, plus 'video_urls'
        (a format-code -> URL dict, possibly empty).
        """
        # General data: regexes that locate each metadata field in the page.
        data = {'title': r'class="video_title datawrap">(.*?)</',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'upload_date': r'data-date="(.*?)"',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Fields are JS-escaped Unicode inside the (mostly utf-8) page.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls, one per known format code when present.
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook if credentials are available; failures are
        reported as warnings and extraction continues anonymously."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        if useremail is None:
            # No credentials: proceed without logging in.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # A login form in the response means the credentials were rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title: collapse filename-unsafe characters to underscores
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail image (missing thumbnail is only a warning, not fatal)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date; best effort, falls back to u'NA'
        upload_date = u'NA'
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # NOTE(review): if url_map is empty, video_url_list is never assigned
        # and the loop below raises NameError — looks like a latent bug;
        # confirm intended behavior before relying on this path.
        for format_param, video_real_url in video_url_list:

            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description.decode('utf-8'),
                    'player_url': None,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
2870 class BlipTVIE(InfoExtractor):
2871 """Information extractor for blip.tv"""
2872
2873 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2874 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2875
2876 @staticmethod
2877 def suitable(url):
2878 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2879
2880 def report_extraction(self, file_id):
2881 """Report information extraction."""
2882 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2883
2884 def _simplify_title(self, title):
2885 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2886 res = res.strip(ur'_')
2887 return res
2888
2889 def _real_extract(self, url):
2890 mobj = re.match(self._VALID_URL, url)
2891 if mobj is None:
2892 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2893 return
2894
2895 if '?' in url:
2896 cchar = '&'
2897 else:
2898 cchar = '?'
2899 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2900 request = urllib2.Request(json_url)
2901 self.report_extraction(mobj.group(1))
2902 try:
2903 json_code = urllib2.urlopen(request).read()
2904 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2905 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2906 return
2907 try:
2908 json_data = json.loads(json_code)
2909 if 'Post' in json_data:
2910 data = json_data['Post']
2911 else:
2912 data = json_data
2913
2914 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2915 video_url = data['media']['url']
2916 umobj = re.match(self._URL_EXT, video_url)
2917 if umobj is None:
2918 raise ValueError('Can not determine filename extension')
2919 ext = umobj.group(1)
2920
2921 self._downloader.increment_downloads()
2922
2923 info = {
2924 'id': data['item_id'],
2925 'url': video_url,
2926 'uploader': data['display_name'],
2927 'upload_date': upload_date,
2928 'title': data['title'],
2929 'stitle': self._simplify_title(data['title']),
2930 'ext': ext,
2931 'format': data['media']['mimeType'],
2932 'thumbnail': data['thumbnailUrl'],
2933 'description': data['description'],
2934 'player_url': data['embedUrl']
2935 }
2936 except (ValueError,KeyError), err:
2937 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2938 return
2939
2940 try:
2941 self._downloader.process_info(info)
2942 except UnavailableVideoError, err:
2943 self._downloader.trouble(u'\nERROR: unable to download video')
2944
class PostProcessor(object):
    """Base class for post-processing steps.

    Instances are attached to a downloader via its add_post_processor()
    method. After each successful download the downloader walks its chain
    of PostProcessors, feeding the return value of one run() call into the
    next. A run() returning None stops the chain; otherwise the (possibly
    modified) information dictionary is forwarded.

    Like InfoExtractor, this class uses a "mutual registration" scheme
    with its downloader.
    """

    # Downloader this PP is registered with (None until set).
    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run this post-processing step.

        `information` is an InfoExtractor-style dictionary extended with a
        "filepath" key pointing at the downloaded file. Returning None halts
        the chain; returning a dictionary passes it to the next processor.
        May raise PostProcessingError, which the calling downloader handles.

        The base implementation is a no-op pass-through.
        """
        return information
class FFmpegExtractAudioPP(PostProcessor):
    """Post processor that converts a downloaded video file into an
    audio-only file using the external ffmpeg/ffprobe tools.

    preferredcodec is 'best' (default), 'aac' or 'mp3'.
    """

    def __init__(self, downloader=None, preferredcodec=None):
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec

    @staticmethod
    def get_audio_codec(path):
        """Return the audio codec name reported by ffprobe for the file at
        path, or None if ffprobe fails or reports no audio stream."""
        devnull = None
        try:
            # Nested try blocks (not try/except/finally) for Python 2.4 compat.
            try:
                devnull = open(os.path.devnull, 'w')
                cmd = ['ffprobe', '-show_streams', '--', path]
                handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
                output = handle.communicate()[0]
                if handle.wait() != 0:
                    return None
            except (IOError, OSError):
                return None
        finally:
            # Fix: the devnull handle used to be leaked on every call.
            if devnull is not None:
                devnull.close()
        audio_codec = None
        for line in output.split('\n'):
            # ffprobe prints codec_name before codec_type within a stream
            # section; remember the last codec_name seen and return it once
            # an audio stream is confirmed.
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                return audio_codec
        return None

    @staticmethod
    def run_ffmpeg(path, out_path, codec, more_opts):
        """Run ffmpeg to extract/transcode the audio track of path into
        out_path; return True on success, False otherwise."""
        devnull = None
        try:
            try:
                devnull = open(os.path.devnull, 'w')
                cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
                ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
                return (ret == 0)
            except (IOError, OSError):
                return False
        finally:
            # Fix: the devnull handle used to be leaked on every call.
            if devnull is not None:
                devnull.close()

    def run(self, information):
        """Convert the downloaded file to audio and delete the original.

        Returns the updated information dict on success, or None (stopping
        the postprocessing chain) on any failure.
        """
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            return None

        more_opts = []
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
            if filecodec == 'aac' or filecodec == 'mp3':
                # Lossless if possible: copy the existing stream.
                acodec = 'copy'
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
            else:
                # MP3 otherwise.
                acodec = 'libmp3lame'
                extension = 'mp3'
                more_opts = ['-ab', '128k']
        else:
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
            extension = self._preferredcodec
            more_opts = ['-ab', '128k']
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']

        (prefix, ext) = os.path.splitext(path)
        new_path = prefix + '.' + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
        status = self.run_ffmpeg(path, new_path, acodec, more_opts)

        if not status:
            self._downloader.to_stderr(u'WARNING: error running ffmpeg')
            return None

        # Remove the original video file only after a successful conversion.
        try:
            os.remove(path)
        except (IOError, OSError):
            self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
            return None

        information['filepath'] = new_path
        return information
3073
3074
def updateSelf(downloader, filename):
    ''' Update the program file with the latest version from the repository '''
    # Note: downloader only used for options
    if not os.access(filename, os.W_OK):
        sys.exit('ERROR: no write permissions on %s' % filename)

    downloader.to_screen('Updating to latest version...')

    # Fix: urlopen() is called before entering the inner try block. The old
    # code opened the URL inside "try/finally: urlh.close()", so a failed
    # urlopen() left urlh unbound and the finally clause raised a NameError
    # that masked the real error and escaped the IOError/OSError handler.
    try:
        urlh = urllib.urlopen(UPDATE_URL)
        try:
            newcontent = urlh.read()
        finally:
            urlh.close()
    except (IOError, OSError):
        sys.exit('ERROR: unable to download latest version')

    try:
        outf = open(filename, 'wb')
        try:
            outf.write(newcontent)
        finally:
            outf.close()
    except (IOError, OSError):
        sys.exit('ERROR: unable to overwrite current version')

    downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3102
def parseOpts():
    """Build the optparse parser and parse sys.argv.

    Returns the tuple (parser, opts, args) so callers can both read the
    parsed options and report errors through parser.error().
    """
    # Deferred imports
    import getpass
    import optparse

    def _format_option_string(option):
        ''' ('-o', '--option') -> -o, --format METAVAR'''

        opts = []

        if option._short_opts: opts.append(option._short_opts[0])
        if option._long_opts: opts.append(option._long_opts[0])
        if len(opts) > 1: opts.insert(1, ', ')

        if option.takes_value(): opts.append(' %s' % option.metavar)

        return "".join(opts)

    def _find_term_columns():
        # Determine the terminal width: honour $COLUMNS first, then fall
        # back to `stty size`; return None if neither works.
        columns = os.environ.get('COLUMNS', None)
        if columns:
            return int(columns)

        try:
            sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            out,err = sp.communicate()
            return int(out.split()[1])
        except:
            pass
        return None

    max_width = 80
    max_help_position = 80

    # No need to wrap help messages if we're on a wide console
    columns = _find_term_columns()
    if columns: max_width = columns

    fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
    fmt.format_option_strings = _format_option_string

    kw = {
        'version'   : __version__,
        'formatter' : fmt,
        'usage' : '%prog [options] url...',
        'conflict_handler' : 'resolve',
    }

    parser = optparse.OptionParser(**kw)

    # option groups
    general        = optparse.OptionGroup(parser, 'General Options')
    authentication = optparse.OptionGroup(parser, 'Authentication Options')
    video_format   = optparse.OptionGroup(parser, 'Video Format Options')
    postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
    filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
    verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

    general.add_option('-h', '--help',
            action='help', help='print this help text and exit')
    general.add_option('-v', '--version',
            action='version', help='print program version and exit')
    general.add_option('-U', '--update',
            action='store_true', dest='update_self', help='update this program to latest version')
    general.add_option('-i', '--ignore-errors',
            action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
    general.add_option('-r', '--rate-limit',
            dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
    general.add_option('-R', '--retries',
            dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
    general.add_option('--playlist-start',
            dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
    general.add_option('--playlist-end',
            dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
    general.add_option('--dump-user-agent',
            action='store_true', dest='dump_user_agent',
            help='display the current browser identification', default=False)

    authentication.add_option('-u', '--username',
            dest='username', metavar='USERNAME', help='account username')
    authentication.add_option('-p', '--password',
            dest='password', metavar='PASSWORD', help='account password')
    authentication.add_option('-n', '--netrc',
            action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


    video_format.add_option('-f', '--format',
            action='store', dest='format', metavar='FORMAT', help='video format code')
    video_format.add_option('--all-formats',
            action='store_const', dest='format', help='download all available video formats', const='-1')
    video_format.add_option('--max-quality',
            action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')


    verbosity.add_option('-q', '--quiet',
            action='store_true', dest='quiet', help='activates quiet mode', default=False)
    verbosity.add_option('-s', '--simulate',
            action='store_true', dest='simulate', help='do not download video', default=False)
    verbosity.add_option('-g', '--get-url',
            action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
    verbosity.add_option('-e', '--get-title',
            action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
    verbosity.add_option('--get-thumbnail',
            action='store_true', dest='getthumbnail',
            help='simulate, quiet but print thumbnail URL', default=False)
    verbosity.add_option('--get-description',
            action='store_true', dest='getdescription',
            help='simulate, quiet but print video description', default=False)
    verbosity.add_option('--get-filename',
            action='store_true', dest='getfilename',
            help='simulate, quiet but print output filename', default=False)
    verbosity.add_option('--no-progress',
            action='store_true', dest='noprogress', help='do not print progress bar', default=False)
    verbosity.add_option('--console-title',
            action='store_true', dest='consoletitle',
            help='display progress in console titlebar', default=False)


    filesystem.add_option('-t', '--title',
            action='store_true', dest='usetitle', help='use title in file name', default=False)
    filesystem.add_option('-l', '--literal',
            action='store_true', dest='useliteral', help='use literal title in file name', default=False)
    filesystem.add_option('-A', '--auto-number',
            action='store_true', dest='autonumber',
            help='number downloaded files starting from 00000', default=False)
    filesystem.add_option('-o', '--output',
            dest='outtmpl', metavar='TEMPLATE', help='output filename template')
    filesystem.add_option('-a', '--batch-file',
            dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
    filesystem.add_option('-w', '--no-overwrites',
            action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
    filesystem.add_option('-c', '--continue',
            action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
    filesystem.add_option('--cookies',
            dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
    filesystem.add_option('--no-part',
            action='store_true', dest='nopart', help='do not use .part files', default=False)
    filesystem.add_option('--no-mtime',
            action='store_false', dest='updatetime',
            help='do not use the Last-modified header to set the file modification time', default=True)
    filesystem.add_option('--write-description',
            action='store_true', dest='writedescription',
            help='write video description to a .description file', default=False)
    filesystem.add_option('--write-info-json',
            action='store_true', dest='writeinfojson',
            help='write video metadata to a .info.json file', default=False)


    postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
            help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
    postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
            help='"best", "aac" or "mp3"; best by default')


    parser.add_option_group(general)
    parser.add_option_group(filesystem)
    parser.add_option_group(verbosity)
    parser.add_option_group(video_format)
    parser.add_option_group(authentication)
    parser.add_option_group(postproc)

    opts, args = parser.parse_args()

    return parser, opts, args
3267
3268 def main():
3269 parser, opts, args = parseOpts()
3270
3271 # Open appropriate CookieJar
3272 if opts.cookiefile is None:
3273 jar = cookielib.CookieJar()
3274 else:
3275 try:
3276 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3277 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3278 jar.load()
3279 except (IOError, OSError), err:
3280 sys.exit(u'ERROR: unable to open cookie file')
3281
3282 # Dump user agent
3283 if opts.dump_user_agent:
3284 print std_headers['User-Agent']
3285 sys.exit(0)
3286
3287 # General configuration
3288 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3289 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
3290 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3291
3292 # Batch file verification
3293 batchurls = []
3294 if opts.batchfile is not None:
3295 try:
3296 if opts.batchfile == '-':
3297 batchfd = sys.stdin
3298 else:
3299 batchfd = open(opts.batchfile, 'r')
3300 batchurls = batchfd.readlines()
3301 batchurls = [x.strip() for x in batchurls]
3302 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3303 except IOError:
3304 sys.exit(u'ERROR: batch file could not be read')
3305 all_urls = batchurls + args
3306
3307 # Conflicting, missing and erroneous options
3308 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3309 parser.error(u'using .netrc conflicts with giving username/password')
3310 if opts.password is not None and opts.username is None:
3311 parser.error(u'account username missing')
3312 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3313 parser.error(u'using output template conflicts with using title, literal title or auto number')
3314 if opts.usetitle and opts.useliteral:
3315 parser.error(u'using title conflicts with using literal title')
3316 if opts.username is not None and opts.password is None:
3317 opts.password = getpass.getpass(u'Type account password and press return:')
3318 if opts.ratelimit is not None:
3319 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3320 if numeric_limit is None:
3321 parser.error(u'invalid rate limit specified')
3322 opts.ratelimit = numeric_limit
3323 if opts.retries is not None:
3324 try:
3325 opts.retries = long(opts.retries)
3326 except (TypeError, ValueError), err:
3327 parser.error(u'invalid retry count specified')
3328 try:
3329 opts.playliststart = int(opts.playliststart)
3330 if opts.playliststart <= 0:
3331 raise ValueError(u'Playlist start must be positive')
3332 except (TypeError, ValueError), err:
3333 parser.error(u'invalid playlist start number specified')
3334 try:
3335 opts.playlistend = int(opts.playlistend)
3336 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3337 raise ValueError(u'Playlist end must be greater than playlist start')
3338 except (TypeError, ValueError), err:
3339 parser.error(u'invalid playlist end number specified')
3340 if opts.extractaudio:
3341 if opts.audioformat not in ['best', 'aac', 'mp3']:
3342 parser.error(u'invalid audio format specified')
3343
3344 # Information extractors
3345 youtube_ie = YoutubeIE()
3346 metacafe_ie = MetacafeIE(youtube_ie)
3347 dailymotion_ie = DailymotionIE()
3348 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
3349 youtube_user_ie = YoutubeUserIE(youtube_ie)
3350 youtube_search_ie = YoutubeSearchIE(youtube_ie)
3351 google_ie = GoogleIE()
3352 google_search_ie = GoogleSearchIE(google_ie)
3353 photobucket_ie = PhotobucketIE()
3354 yahoo_ie = YahooIE()
3355 yahoo_search_ie = YahooSearchIE(yahoo_ie)
3356 deposit_files_ie = DepositFilesIE()
3357 facebook_ie = FacebookIE()
3358 bliptv_ie = BlipTVIE()
3359 vimeo_ie = VimeoIE()
3360 generic_ie = GenericIE()
3361
3362 # File downloader
3363 fd = FileDownloader({
3364 'usenetrc': opts.usenetrc,
3365 'username': opts.username,
3366 'password': opts.password,
3367 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3368 'forceurl': opts.geturl,
3369 'forcetitle': opts.gettitle,
3370 'forcethumbnail': opts.getthumbnail,
3371 'forcedescription': opts.getdescription,
3372 'forcefilename': opts.getfilename,
3373 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3374 'format': opts.format,
3375 'format_limit': opts.format_limit,
3376 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3377 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3378 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3379 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3380 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3381 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3382 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3383 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3384 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3385 or u'%(id)s.%(ext)s'),
3386 'ignoreerrors': opts.ignoreerrors,
3387 'ratelimit': opts.ratelimit,
3388 'nooverwrites': opts.nooverwrites,
3389 'retries': opts.retries,
3390 'continuedl': opts.continue_dl,
3391 'noprogress': opts.noprogress,
3392 'playliststart': opts.playliststart,
3393 'playlistend': opts.playlistend,
3394 'logtostderr': opts.outtmpl == '-',
3395 'consoletitle': opts.consoletitle,
3396 'nopart': opts.nopart,
3397 'updatetime': opts.updatetime,
3398 'writedescription': opts.writedescription,
3399 'writeinfojson': opts.writeinfojson,
3400 })
3401 fd.add_info_extractor(youtube_search_ie)
3402 fd.add_info_extractor(youtube_pl_ie)
3403 fd.add_info_extractor(youtube_user_ie)
3404 fd.add_info_extractor(metacafe_ie)
3405 fd.add_info_extractor(dailymotion_ie)
3406 fd.add_info_extractor(youtube_ie)
3407 fd.add_info_extractor(google_ie)
3408 fd.add_info_extractor(google_search_ie)
3409 fd.add_info_extractor(photobucket_ie)
3410 fd.add_info_extractor(yahoo_ie)
3411 fd.add_info_extractor(yahoo_search_ie)
3412 fd.add_info_extractor(deposit_files_ie)
3413 fd.add_info_extractor(facebook_ie)
3414 fd.add_info_extractor(bliptv_ie)
3415 fd.add_info_extractor(vimeo_ie)
3416
3417 # This must come last since it's the
3418 # fallback if none of the others work
3419 fd.add_info_extractor(generic_ie)
3420
3421 # PostProcessors
3422 if opts.extractaudio:
3423 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3424
3425 # Update version
3426 if opts.update_self:
3427 updateSelf(fd, sys.argv[0])
3428
3429 # Maybe do nothing
3430 if len(all_urls) < 1:
3431 if not opts.update_self:
3432 parser.error(u'you must provide at least one URL')
3433 else:
3434 sys.exit()
3435 retcode = fd.download(all_urls)
3436
3437 # Dump cookie jar if requested
3438 if opts.cookiefile is not None:
3439 try:
3440 jar.save()
3441 except (IOError, OSError), err:
3442 sys.exit(u'ERROR: unable to save cookie jar')
3443
3444 sys.exit(retcode)
3445
3446
if __name__ == '__main__':
    # Translate the expected top-level exceptions into clean exit codes
    # instead of tracebacks.
    try:
        main()
    except DownloadError:
        # Errors were already reported by the downloader; just signal failure.
        sys.exit(1)
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')
3456
3457 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: