1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __author__ = (
5 'Ricardo Garcia Gonzalez',
6 'Danny Colligan',
7 'Benjamin Johnson',
8 'Vasyl\' Vavrychuk',
9 'Witold Baryluk',
10 'Paweł Paprota',
11 'Gergely Imreh',
12 'Rogério Brito',
13 'Philipp Hagemeister',
14 'Sören Schulze',
15 'Kevin Ngo',
16 'Ori Avtalion',
17 'shizeeg',
18 )
19
20 __license__ = 'Public Domain'
21 __version__ = '2011.11.23'
22
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
25 import cookielib
26 import datetime
27 import gzip
28 import htmlentitydefs
29 import HTMLParser
30 import httplib
31 import locale
32 import math
33 import netrc
34 import os
35 import os.path
36 import re
37 import socket
38 import string
39 import subprocess
40 import sys
41 import time
42 import urllib
43 import urllib2
44 import warnings
45 import zlib
46
47 if os.name == 'nt':
48 import ctypes
49
50 try:
51 import email.utils
52 except ImportError: # Python 2.4
53 import email.Utils
54 try:
55 import cStringIO as StringIO
56 except ImportError:
57 import StringIO
58
59 # parse_qs was moved from the cgi module to the urlparse module recently.
60 try:
61 from urlparse import parse_qs
62 except ImportError:
63 from cgi import parse_qs
64
65 try:
66 import lxml.etree
67 except ImportError:
68 pass # Handled below
69
70 try:
71 import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
74
75 std_headers = {
76 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
77 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
78 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
79 'Accept-Encoding': 'gzip, deflate',
80 'Accept-Language': 'en-us,en;q=0.5',
81 }
82
83 try:
84 import json
85 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
86 import re
87 class json(object):
88 @staticmethod
89 def loads(s):
90 s = s.decode('UTF-8')
91 def raiseError(msg, i):
92 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
93 def skipSpace(i, expectMore=True):
94 while i < len(s) and s[i] in ' \t\r\n':
95 i += 1
96 if expectMore:
97 if i >= len(s):
98 raiseError('Premature end', i)
99 return i
100 def decodeEscape(match):
101 esc = match.group(1)
102 _STATIC = {
103 '"': '"',
104 '\\': '\\',
105 '/': '/',
106 'b': unichr(0x8),
107 'f': unichr(0xc),
108 'n': '\n',
109 'r': '\r',
110 't': '\t',
111 }
112 if esc in _STATIC:
113 return _STATIC[esc]
114 if esc[0] == 'u':
115 if len(esc) == 1+4:
116 return unichr(int(esc[1:5], 16))
117 if len(esc) == 5+6 and esc[5:7] == '\\u':
118 hi = int(esc[1:5], 16)
119 low = int(esc[7:11], 16)
120 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
121 raise ValueError('Unknown escape ' + str(esc))
122 def parseString(i):
123 i += 1
124 e = i
125 while True:
126 e = s.index('"', e)
127 bslashes = 0
128 while s[e-bslashes-1] == '\\':
129 bslashes += 1
130 if bslashes % 2 == 1:
131 e += 1
132 continue
133 break
134 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
135 stri = rexp.sub(decodeEscape, s[i:e])
136 return (e+1,stri)
137 def parseObj(i):
138 i += 1
139 res = {}
140 i = skipSpace(i)
141 if s[i] == '}': # Empty dictionary
142 return (i+1,res)
143 while True:
144 if s[i] != '"':
145 raiseError('Expected a string object key', i)
146 i,key = parseString(i)
147 i = skipSpace(i)
148 if i >= len(s) or s[i] != ':':
149 raiseError('Expected a colon', i)
150 i,val = parse(i+1)
151 res[key] = val
152 i = skipSpace(i)
153 if s[i] == '}':
154 return (i+1, res)
155 if s[i] != ',':
156 raiseError('Expected comma or closing curly brace', i)
157 i = skipSpace(i+1)
158 def parseArray(i):
159 res = []
160 i = skipSpace(i+1)
161 if s[i] == ']': # Empty array
162 return (i+1,res)
163 while True:
164 i,val = parse(i)
165 res.append(val)
166 i = skipSpace(i) # Raise exception if premature end
167 if s[i] == ']':
168 return (i+1, res)
169 if s[i] != ',':
170 raiseError('Expected a comma or closing bracket', i)
171 i = skipSpace(i+1)
172 def parseDiscrete(i):
173 for k,v in {'true': True, 'false': False, 'null': None}.items():
174 if s.startswith(k, i):
175 return (i+len(k), v)
176 raiseError('Not a boolean (or null)', i)
177 def parseNumber(i):
178 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
179 if mobj is None:
180 raiseError('Not a number', i)
181 nums = mobj.group(1)
182 if '.' in nums or 'e' in nums or 'E' in nums:
183 return (i+len(nums), float(nums))
184 return (i+len(nums), int(nums))
185 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
186 def parse(i):
187 i = skipSpace(i)
188 i,res = CHARMAP.get(s[i], parseNumber)(i)
189 i = skipSpace(i, False)
190 return (i,res)
191 i,res = parse(0)
192 if i < len(s):
193 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
194 return res
195
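# A minimal usage sketch (hypothetical document): whether `json` ended up as
# the stdlib module or the trivialjson fallback above, the same call works.
def _json_loads_example():
    """Illustrative only; not called anywhere in the program."""
    return json.loads('{"title": "test", "tags": ["a", "b"], "views": 42}')
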
196 def preferredencoding():
197 """Get preferred encoding.
198
199 Returns the best encoding scheme for the system, based on
200 locale.getpreferredencoding() and some further tweaks.
201 """
202 def yield_preferredencoding():
203 try:
204 pref = locale.getpreferredencoding()
205 u'TEST'.encode(pref)
206 except:
207 pref = 'UTF-8'
208 while True:
209 yield pref
210 return yield_preferredencoding().next()
211
212
213 def htmlentity_transform(matchobj):
214 """Transforms an HTML entity to a Unicode character.
215
216 This function receives a match object and is intended to be used with
217 the re.sub() function.
218 """
219 entity = matchobj.group(1)
220
221 # Known non-numeric HTML entity
222 if entity in htmlentitydefs.name2codepoint:
223 return unichr(htmlentitydefs.name2codepoint[entity])
224
225 # Unicode character
226 	        mobj = re.match(ur'(?u)#(x[0-9a-fA-F]+|\d+)', entity)
227 if mobj is not None:
228 numstr = mobj.group(1)
229 if numstr.startswith(u'x'):
230 base = 16
231 numstr = u'0%s' % numstr
232 else:
233 base = 10
234 return unichr(long(numstr, base))
235
236 # Unknown entity in name, return its literal representation
237 return (u'&%s;' % entity)
238
239
240 def sanitize_title(utitle):
241 """Sanitizes a video title so it could be used as part of a filename."""
242 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
243 return utitle.replace(unicode(os.sep), u'%')
244
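# A small usage sketch (hypothetical title): HTML entities are decoded and any
# os.sep characters are replaced, so the result is safe inside a filename.
def _sanitize_title_example():
    """Illustrative only; not called anywhere in the program."""
    # On POSIX (os.sep == '/') this returns u'Tom & Jerry: Fast & Furry%ous'
    return sanitize_title(u'Tom &amp; Jerry: Fast &#38; Furry&#47;ous')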
245
246 def sanitize_open(filename, open_mode):
247 """Try to open the given filename, and slightly tweak it if this fails.
248
249 Attempts to open the given filename. If this fails, it tries to change
250 the filename slightly, step by step, until it's either able to open it
251 or it fails and raises a final exception, like the standard open()
252 function.
253
254 It returns the tuple (stream, definitive_file_name).
255 """
256 try:
257 if filename == u'-':
258 if sys.platform == 'win32':
259 import msvcrt
260 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
261 return (sys.stdout, filename)
262 stream = open(filename, open_mode)
263 return (stream, filename)
264 except (IOError, OSError), err:
265 # In case of error, try to remove win32 forbidden chars
266 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
267
268 # An exception here should be caught in the caller
269 stream = open(filename, open_mode)
270 return (stream, filename)
271
272
273 def timeconvert(timestr):
274 """Convert RFC 2822 defined time string into system timestamp"""
275 timestamp = None
276 timetuple = email.utils.parsedate_tz(timestr)
277 if timetuple is not None:
278 timestamp = email.utils.mktime_tz(timetuple)
279 return timestamp
280
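# Usage sketch: this is how Last-Modified headers (RFC 2822 dates) become the
# Unix timestamps that try_utime() applies to downloaded files.
def _timeconvert_example():
    """Illustrative only; not called anywhere in the program."""
    return timeconvert('Wed, 23 Nov 2011 12:00:00 +0000')  # -> 1322049600
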
281 def _simplify_title(title):
282 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
283 return expr.sub(u'_', title).strip(u'_')
284
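# Usage sketch: runs of characters outside [word chars, digits, _, -] collapse
# into single underscores, and leading/trailing underscores are stripped.
def _simplify_title_example():
    """Illustrative only; not called anywhere in the program."""
    return _simplify_title(u'My Video: part 1/2')  # -> u'My_Video_part_1_2'
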
285 class DownloadError(Exception):
286 """Download Error exception.
287
288 This exception may be thrown by FileDownloader objects if they are not
289 configured to continue on errors. They will contain the appropriate
290 error message.
291 """
292 pass
293
294
295 class SameFileError(Exception):
296 """Same File exception.
297
298 This exception will be thrown by FileDownloader objects if they detect
299 multiple files would have to be downloaded to the same file on disk.
300 """
301 pass
302
303
304 class PostProcessingError(Exception):
305 """Post Processing exception.
306
307 This exception may be raised by PostProcessor's .run() method to
308 indicate an error in the postprocessing task.
309 """
310 pass
311
312
313 class UnavailableVideoError(Exception):
314 """Unavailable Format exception.
315
316 This exception will be thrown when a video is requested
317 in a format that is not available for that video.
318 """
319 pass
320
321
322 class ContentTooShortError(Exception):
323 """Content Too Short exception.
324
325 This exception may be raised by FileDownloader objects when a file they
326 download is too small for what the server announced first, indicating
327 the connection was probably interrupted.
328 """
329 # Both in bytes
330 downloaded = None
331 expected = None
332
333 def __init__(self, downloaded, expected):
334 self.downloaded = downloaded
335 self.expected = expected
336
337
338 class YoutubeDLHandler(urllib2.HTTPHandler):
339 """Handler for HTTP requests and responses.
340
341 This class, when installed with an OpenerDirector, automatically adds
342 the standard headers to every HTTP request and handles gzipped and
343 deflated responses from web servers. If compression is to be avoided in
344 a particular request, the original request in the program code only has
345 to include the HTTP header "Youtubedl-No-Compression", which will be
346 removed before making the real request.
347
348 Part of this code was copied from:
349
350 http://techknack.net/python-urllib2-handlers/
351
352 Andrew Rowls, the author of that code, agreed to release it to the
353 public domain.
354 """
355
356 @staticmethod
357 def deflate(data):
358 try:
359 return zlib.decompress(data, -zlib.MAX_WBITS)
360 except zlib.error:
361 return zlib.decompress(data)
362
363 @staticmethod
364 def addinfourl_wrapper(stream, headers, url, code):
365 if hasattr(urllib2.addinfourl, 'getcode'):
366 return urllib2.addinfourl(stream, headers, url, code)
367 ret = urllib2.addinfourl(stream, headers, url)
368 ret.code = code
369 return ret
370
371 def http_request(self, req):
372 for h in std_headers:
373 if h in req.headers:
374 del req.headers[h]
375 req.add_header(h, std_headers[h])
376 if 'Youtubedl-no-compression' in req.headers:
377 if 'Accept-encoding' in req.headers:
378 del req.headers['Accept-encoding']
379 del req.headers['Youtubedl-no-compression']
380 return req
381
382 def http_response(self, req, resp):
383 old_resp = resp
384 # gzip
385 if resp.headers.get('Content-encoding', '') == 'gzip':
386 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
387 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
388 resp.msg = old_resp.msg
389 # deflate
390 if resp.headers.get('Content-encoding', '') == 'deflate':
391 gz = StringIO.StringIO(self.deflate(resp.read()))
392 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
393 resp.msg = old_resp.msg
394 return resp
395
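# A minimal sketch of how this handler is meant to be installed (the URL is a
# placeholder): build_opener() wires it into the urllib2 request machinery.
def _opener_example():
    """Illustrative only; not called anywhere in the program."""
    opener = urllib2.build_opener(YoutubeDLHandler())
    # The marker header below makes the handler drop Accept-Encoding entirely
    request = urllib2.Request('http://www.example.com/',
                              headers={'Youtubedl-no-compression': 'True'})
    return opener.open(request).read()
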
396
397 class FileDownloader(object):
398 """File Downloader class.
399
400 File downloader objects are the ones responsible for downloading the
401 actual video file and writing it to disk if the user has requested
402 it, among some other tasks. In most cases there should be one per
403 program. Since, given a video URL, the downloader doesn't know how to
404 extract all the needed information (a task that InfoExtractors do),
405 it has to pass the URL to one of them.
406
407 For this, file downloader objects have a method that allows
408 InfoExtractors to be registered in a given order. When it is passed
409 a URL, the file downloader hands it to the first InfoExtractor it
410 finds that reports being able to handle it. The InfoExtractor extracts
411 all the information about the video or videos the URL refers to, and
412 asks the FileDownloader to process the video information, possibly
413 downloading the video.
414
415 File downloaders accept a lot of parameters. In order not to saturate
416 the object constructor with arguments, it receives a dictionary of
417 options instead. These options are available through the params
418 attribute for the InfoExtractors to use. The FileDownloader also
419 registers itself as the downloader in charge for the InfoExtractors
420 that are added to it, so this is a "mutual registration".
421
422 Available options:
423
424 username: Username for authentication purposes.
425 password: Password for authentication purposes.
426 usenetrc: Use netrc for authentication instead.
427 quiet: Do not print messages to stdout.
428 forceurl: Force printing final URL.
429 forcetitle: Force printing title.
430 forcethumbnail: Force printing thumbnail URL.
431 forcedescription: Force printing description.
432 forcefilename: Force printing final filename.
433 simulate: Do not download the video files.
434 format: Video format code.
435 format_limit: Highest quality format to try.
436 outtmpl: Template for output names.
437 ignoreerrors: Do not stop on download errors.
438 ratelimit: Download speed limit, in bytes/sec.
439 nooverwrites: Prevent overwriting files.
440 retries: Number of times to retry for HTTP error 5xx.
441 continuedl: Try to continue downloads if possible.
442 noprogress: Do not print the progress bar.
443 playliststart: Playlist item to start at.
444 playlistend: Playlist item to end at.
445 matchtitle: Download only matching titles.
446 rejecttitle: Reject downloads for matching titles.
447 logtostderr: Log messages to stderr instead of stdout.
448 consoletitle: Display progress in console window's titlebar.
449 nopart: Do not use temporary .part files.
450 updatetime: Use the Last-modified header to set output file timestamps.
451 writedescription: Write the video description to a .description file
452 writeinfojson: Write the video description to a .info.json file
453 """
454
455 params = None
456 _ies = []
457 _pps = []
458 _download_retcode = None
459 _num_downloads = None
460 _screen_file = None
461
462 def __init__(self, params):
463 """Create a FileDownloader object with the given options."""
464 self._ies = []
465 self._pps = []
466 self._download_retcode = 0
467 self._num_downloads = 0
468 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
469 self.params = params
470
471 @staticmethod
472 def format_bytes(bytes):
473 if bytes is None:
474 return 'N/A'
475 if type(bytes) is str:
476 bytes = float(bytes)
477 if bytes == 0.0:
478 exponent = 0
479 else:
480 exponent = long(math.log(bytes, 1024.0))
481 suffix = 'bkMGTPEZY'[exponent]
482 converted = float(bytes) / float(1024 ** exponent)
483 return '%.2f%s' % (converted, suffix)
484
485 @staticmethod
486 def calc_percent(byte_counter, data_len):
487 if data_len is None:
488 return '---.-%'
489 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
490
491 @staticmethod
492 def calc_eta(start, now, total, current):
493 if total is None:
494 return '--:--'
495 dif = now - start
496 if current == 0 or dif < 0.001: # One millisecond
497 return '--:--'
498 rate = float(current) / dif
499 eta = long((float(total) - float(current)) / rate)
500 (eta_mins, eta_secs) = divmod(eta, 60)
501 if eta_mins > 99:
502 return '--:--'
503 return '%02d:%02d' % (eta_mins, eta_secs)
504
505 @staticmethod
506 def calc_speed(start, now, bytes):
507 dif = now - start
508 if bytes == 0 or dif < 0.001: # One millisecond
509 return '%10s' % '---b/s'
510 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
511
512 @staticmethod
513 def best_block_size(elapsed_time, bytes):
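# Aim the next read at the measured rate, clamped between half and double
# the previous block and capped at 4 MB, so the block size adapts smoothly.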
514 new_min = max(bytes / 2.0, 1.0)
515 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
516 if elapsed_time < 0.001:
517 return long(new_max)
518 rate = bytes / elapsed_time
519 if rate > new_max:
520 return long(new_max)
521 if rate < new_min:
522 return long(new_min)
523 return long(rate)
524
525 @staticmethod
526 def parse_bytes(bytestr):
527 """Parse a string indicating a byte quantity into a long integer."""
528 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
529 if matchobj is None:
530 return None
531 number = float(matchobj.group(1))
532 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
533 return long(round(number * multiplier))
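# Round-trip sketch: parse_bytes('10.5M') -> 11010048L and
# format_bytes(11010048L) -> '10.50M', so the two are rough inverses.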
534
535 def add_info_extractor(self, ie):
536 """Add an InfoExtractor object to the end of the list."""
537 self._ies.append(ie)
538 ie.set_downloader(self)
539
540 def add_post_processor(self, pp):
541 """Add a PostProcessor object to the end of the chain."""
542 self._pps.append(pp)
543 pp.set_downloader(self)
544
545 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
546 """Print message to stdout if not in quiet mode."""
547 try:
548 if not self.params.get('quiet', False):
549 terminator = [u'\n', u''][skip_eol]
550 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
551 self._screen_file.flush()
552 except (UnicodeEncodeError), err:
553 if not ignore_encoding_errors:
554 raise
555
556 def to_stderr(self, message):
557 """Print message to stderr."""
558 print >>sys.stderr, message.encode(preferredencoding())
559
560 def to_cons_title(self, message):
561 """Set console/terminal window title to message."""
562 if not self.params.get('consoletitle', False):
563 return
564 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
565 # c_wchar_p() might not be necessary if `message` is
566 # already of type unicode()
567 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
568 elif 'TERM' in os.environ:
569 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
570
571 def fixed_template(self):
572 """Checks if the output template is fixed."""
573 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
574
575 def trouble(self, message=None):
576 """Determine action to take when a download problem appears.
577
578 Depending on whether the downloader has been configured to ignore
579 download errors, this method may raise an exception after printing
580 the message, or just record a nonzero return code.
581 """
582 if message is not None:
583 self.to_stderr(message)
584 if not self.params.get('ignoreerrors', False):
585 raise DownloadError(message)
586 self._download_retcode = 1
587
588 def slow_down(self, start_time, byte_counter):
589 """Sleep if the download speed is over the rate limit."""
590 rate_limit = self.params.get('ratelimit', None)
591 if rate_limit is None or byte_counter == 0:
592 return
593 now = time.time()
594 elapsed = now - start_time
595 if elapsed <= 0.0:
596 return
597 speed = float(byte_counter) / elapsed
598 if speed > rate_limit:
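# Sleep just long enough for the average speed to fall back to the limit:
# this solves byte_counter / (elapsed + t) == rate_limit for t.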
599 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
600
601 def temp_name(self, filename):
602 """Returns a temporary filename for the given filename."""
603 if self.params.get('nopart', False) or filename == u'-' or \
604 (os.path.exists(filename) and not os.path.isfile(filename)):
605 return filename
606 return filename + u'.part'
607
608 def undo_temp_name(self, filename):
609 if filename.endswith(u'.part'):
610 return filename[:-len(u'.part')]
611 return filename
612
613 def try_rename(self, old_filename, new_filename):
614 try:
615 if old_filename == new_filename:
616 return
617 os.rename(old_filename, new_filename)
618 except (IOError, OSError), err:
619 self.trouble(u'ERROR: unable to rename file')
620
621 def try_utime(self, filename, last_modified_hdr):
622 """Try to set the last-modified time of the given file."""
623 if last_modified_hdr is None:
624 return
625 if not os.path.isfile(filename):
626 return
627 timestr = last_modified_hdr
628 if timestr is None:
629 return
630 filetime = timeconvert(timestr)
631 if filetime is None:
632 return filetime
633 try:
634 os.utime(filename, (time.time(), filetime))
635 except:
636 pass
637 return filetime
638
639 def report_writedescription(self, descfn):
640 """ Report that the description file is being written """
641 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
642
643 def report_writeinfojson(self, infofn):
644 """ Report that the metadata file has been written """
645 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
646
647 def report_destination(self, filename):
648 """Report destination filename."""
649 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
650
651 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
652 """Report download progress."""
653 if self.params.get('noprogress', False):
654 return
655 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
656 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
657 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
658 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
659
660 def report_resuming_byte(self, resume_len):
661 """Report attempt to resume at given byte."""
662 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
663
664 def report_retry(self, count, retries):
665 """Report retry in case of HTTP error 5xx"""
666 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
667
668 def report_file_already_downloaded(self, file_name):
669 """Report file has already been fully downloaded."""
670 try:
671 self.to_screen(u'[download] %s has already been downloaded' % file_name)
672 except (UnicodeEncodeError), err:
673 self.to_screen(u'[download] The file has already been downloaded')
674
675 def report_unable_to_resume(self):
676 """Report it was impossible to resume download."""
677 self.to_screen(u'[download] Unable to resume')
678
679 def report_finish(self):
680 """Report download finished."""
681 if self.params.get('noprogress', False):
682 self.to_screen(u'[download] Download completed')
683 else:
684 self.to_screen(u'')
685
686 def increment_downloads(self):
687 """Increment the ordinal that assigns a number to each file."""
688 self._num_downloads += 1
689
690 def prepare_filename(self, info_dict):
691 """Generate the output filename."""
692 try:
693 template_dict = dict(info_dict)
694 template_dict['epoch'] = unicode(long(time.time()))
695 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
696 filename = self.params['outtmpl'] % template_dict
697 return filename
698 except (ValueError, KeyError), err:
699 self.trouble(u'ERROR: invalid system charset or erroneous output template')
700 return None
701
702 def _match_entry(self, info_dict):
703 """ Returns None iff the file should be downloaded """
704
705 title = info_dict['title']
706 matchtitle = self.params.get('matchtitle', False)
707 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
708 return u'"' + title + '" title did not match pattern "' + matchtitle + '"'
709 rejecttitle = self.params.get('rejecttitle', False)
710 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
711 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
712 return None
713
714 def process_info(self, info_dict):
715 """Process a single dictionary returned by an InfoExtractor."""
716
717 reason = self._match_entry(info_dict)
718 if reason is not None:
719 self.to_screen(u'[download] ' + reason)
720 return
721
722 max_downloads = self.params.get('max_downloads')
723 if max_downloads is not None:
724 if self._num_downloads > int(max_downloads):
725 self.to_screen(u'[download] Maximum number of downloads reached. Skipping ' + info_dict['title'])
726 return
727
728 filename = self.prepare_filename(info_dict)
729
730 # Forced printings
731 if self.params.get('forcetitle', False):
732 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
733 if self.params.get('forceurl', False):
734 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
735 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
736 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
737 if self.params.get('forcedescription', False) and 'description' in info_dict:
738 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
739 if self.params.get('forcefilename', False) and filename is not None:
740 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
741 if self.params.get('forceformat', False):
742 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
743
744 # Do nothing else if in simulate mode
745 if self.params.get('simulate', False):
746 return
747
748 if filename is None:
749 return
750
751 if self.params.get('nooverwrites', False) and os.path.exists(filename):
752 self.to_stderr(u'WARNING: file exists and will be skipped')
753 return
754
755 try:
756 dn = os.path.dirname(filename)
757 if dn != '' and not os.path.exists(dn):
758 os.makedirs(dn)
759 except (OSError, IOError), err:
760 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
761 return
762
763 if self.params.get('writedescription', False):
764 try:
765 descfn = filename + '.description'
766 self.report_writedescription(descfn)
767 descfile = open(descfn, 'wb')
768 try:
769 descfile.write(info_dict['description'].encode('utf-8'))
770 finally:
771 descfile.close()
772 except (OSError, IOError):
773 self.trouble(u'ERROR: Cannot write description file ' + descfn)
774 return
775
776 if self.params.get('writeinfojson', False):
777 infofn = filename + '.info.json'
778 self.report_writeinfojson(infofn)
779 try:
780 json.dump
781 except (NameError,AttributeError):
782 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
783 return
784 try:
785 infof = open(infofn, 'wb')
786 try:
787 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
788 json.dump(json_info_dict, infof)
789 finally:
790 infof.close()
791 except (OSError, IOError):
792 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
793 return
794
795 if not self.params.get('skip_download', False):
796 try:
797 success = self._do_download(filename, info_dict)
798 except (OSError, IOError), err:
799 raise UnavailableVideoError
800 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
801 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
802 return
803 except (ContentTooShortError, ), err:
804 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
805 return
806
807 if success:
808 try:
809 self.post_process(filename, info_dict)
810 except (PostProcessingError), err:
811 self.trouble(u'ERROR: postprocessing: %s' % str(err))
812 return
813
814 def download(self, url_list):
815 """Download a given list of URLs."""
816 if len(url_list) > 1 and self.fixed_template():
817 raise SameFileError(self.params['outtmpl'])
818
819 for url in url_list:
820 suitable_found = False
821 for ie in self._ies:
822 # Go to next InfoExtractor if not suitable
823 if not ie.suitable(url):
824 continue
825
826 # Suitable InfoExtractor found
827 suitable_found = True
828
829 # Extract information from URL and process it
830 ie.extract(url)
831
832 # Suitable InfoExtractor had been found; go to next URL
833 break
834
835 if not suitable_found:
836 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
837
838 return self._download_retcode
839
840 def post_process(self, filename, ie_info):
841 """Run the postprocessing chain on the given file."""
842 info = dict(ie_info)
843 info['filepath'] = filename
844 for pp in self._pps:
845 info = pp.run(info)
846 if info is None:
847 break
848
849 def _download_with_rtmpdump(self, filename, url, player_url):
850 self.report_destination(filename)
851 tmpfilename = self.temp_name(filename)
852
853 # Check for rtmpdump first
854 try:
855 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
856 except (OSError, IOError):
857 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
858 return False
859
860 # Download using rtmpdump. rtmpdump returns exit code 2 when
861 # the connection was interrupted and resuming appears to be
862 # possible. This is part of rtmpdump's normal usage, AFAIK.
863 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
864 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
865 while retval == 2 or retval == 1:
866 prevsize = os.path.getsize(tmpfilename)
867 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
868 time.sleep(5.0) # This seems to be needed
869 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
870 cursize = os.path.getsize(tmpfilename)
871 if prevsize == cursize and retval == 1:
872 break
873 # Some rtmp streams seem to abort after ~99.8%. Don't complain in that case
874 if prevsize == cursize and retval == 2 and cursize > 1024:
875 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
876 retval = 0
877 break
878 if retval == 0:
879 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
880 self.try_rename(tmpfilename, filename)
881 return True
882 else:
883 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
884 return False
885
886 def _do_download(self, filename, info_dict):
887 url = info_dict['url']
888 player_url = info_dict.get('player_url', None)
889
890 # Check file already present
891 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
892 self.report_file_already_downloaded(filename)
893 return True
894
895 # Attempt to download using rtmpdump
896 if url.startswith('rtmp'):
897 return self._download_with_rtmpdump(filename, url, player_url)
898
899 tmpfilename = self.temp_name(filename)
900 stream = None
901
902 # Do not include the Accept-Encoding header
903 headers = {'Youtubedl-no-compression': 'True'}
904 basic_request = urllib2.Request(url, None, headers)
905 request = urllib2.Request(url, None, headers)
906
907 # Establish possible resume length
908 if os.path.isfile(tmpfilename):
909 resume_len = os.path.getsize(tmpfilename)
910 else:
911 resume_len = 0
912
913 open_mode = 'wb'
914 if resume_len != 0:
915 if self.params.get('continuedl', False):
916 self.report_resuming_byte(resume_len)
917 request.add_header('Range','bytes=%d-' % resume_len)
918 open_mode = 'ab'
919 else:
920 resume_len = 0
921
922 count = 0
923 retries = self.params.get('retries', 0)
924 while count <= retries:
925 # Establish connection
926 try:
927 if count == 0 and 'urlhandle' in info_dict:
928 data = info_dict['urlhandle']  # reuse the handle opened during extraction
929 else: data = urllib2.urlopen(request)
930 break
931 except (urllib2.HTTPError, ), err:
932 if (err.code < 500 or err.code >= 600) and err.code != 416:
933 # Unexpected HTTP error
934 raise
935 elif err.code == 416:
936 # Unable to resume (requested range not satisfiable)
937 try:
938 # Open the connection again without the range header
939 data = urllib2.urlopen(basic_request)
940 content_length = data.info()['Content-Length']
941 except (urllib2.HTTPError, ), err:
942 if err.code < 500 or err.code >= 600:
943 raise
944 else:
945 # Examine the reported length
946 if (content_length is not None and
947 (resume_len - 100 < long(content_length) < resume_len + 100)):
948 # The file had already been fully downloaded.
949 # Explanation of the above condition: in issue #175 it was revealed that
950 # YouTube sometimes adds or removes a few bytes from the end of the file,
951 # changing the file size slightly and causing problems for some users. So
952 # I decided to implement a suggested change and consider the file
953 # completely downloaded if the file size differs by less than 100 bytes from
954 # the one already on disk.
955 self.report_file_already_downloaded(filename)
956 self.try_rename(tmpfilename, filename)
957 return True
958 else:
959 # The length does not match, we start the download over
960 self.report_unable_to_resume()
961 open_mode = 'wb'
962 break
963 # Retry
964 count += 1
965 if count <= retries:
966 self.report_retry(count, retries)
967
968 if count > retries:
969 self.trouble(u'ERROR: giving up after %s retries' % retries)
970 return False
971
972 data_len = data.info().get('Content-length', None)
973 if data_len is not None:
974 data_len = long(data_len) + resume_len
975 data_len_str = self.format_bytes(data_len)
976 byte_counter = 0 + resume_len
977 block_size = 1024
978 start = time.time()
979 while True:
980 # Download and write
981 before = time.time()
982 data_block = data.read(block_size)
983 after = time.time()
984 if len(data_block) == 0:
985 break
986 byte_counter += len(data_block)
987
988 # Open file just in time
989 if stream is None:
990 try:
991 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
992 assert stream is not None
993 filename = self.undo_temp_name(tmpfilename)
994 self.report_destination(filename)
995 except (OSError, IOError), err:
996 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
997 return False
998 try:
999 stream.write(data_block)
1000 except (IOError, OSError), err:
1001 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1002 return False
1003 block_size = self.best_block_size(after - before, len(data_block))
1004
1005 # Progress message
1006 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1007 if data_len is None:
1008 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1009 else:
1010 percent_str = self.calc_percent(byte_counter, data_len)
1011 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1012 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1013
1014 # Apply rate limit
1015 self.slow_down(start, byte_counter - resume_len)
1016
1017 if stream is None:
1018 self.trouble(u'\nERROR: Did not get any data blocks')
1019 return False
1020 stream.close()
1021 self.report_finish()
1022 if data_len is not None and byte_counter != data_len:
1023 raise ContentTooShortError(byte_counter, long(data_len))
1024 self.try_rename(tmpfilename, filename)
1025
1026 # Update file modification time
1027 if self.params.get('updatetime', True):
1028 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1029
1030 return True
1031
1032
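# A minimal end-to-end usage sketch (the option values and video URL are
# hypothetical): options arrive as a single dict, per the FileDownloader
# docstring, and add_info_extractor() performs the "mutual registration".
def _filedownloader_example():
    """Illustrative only; not called anywhere in the program."""
    fd = FileDownloader({
        'outtmpl': u'%(stitle)s-%(id)s.%(ext)s',
        'retries': 10,
        'continuedl': True,
    })
    fd.add_info_extractor(YoutubeIE())
    return fd.download([u'http://www.youtube.com/watch?v=abc123'])
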
1033 class InfoExtractor(object):
1034 """Information Extractor class.
1035
1036 Information extractors are the classes that, given a URL, extract
1037 information from the video (or videos) the URL refers to. This
1038 information includes the real video URL, the video title and simplified
1039 title, author and others. The information is stored in a dictionary
1040 which is then passed to the FileDownloader. The FileDownloader
1041 processes this information, possibly downloading the video to the file
1042 system, among other possible outcomes. The dictionaries must include
1043 the following fields:
1044
1045 id: Video identifier.
1046 url: Final video URL.
1047 uploader: Nickname of the video uploader.
1048 title: Literal title.
1049 stitle: Simplified title.
1050 ext: Video filename extension.
1051 format: Video format.
1052 player_url: SWF Player URL (may be None).
1053
1054 The following fields are optional. Their primary purpose is to allow
1055 youtube-dl to serve as the backend for a video search function, such
1056 as the one in youtube2mp3. They are only used when their respective
1057 forced printing functions are called:
1058
1059 thumbnail: Full URL to a video thumbnail image.
1060 description: One-line video description.
1061
1062 Subclasses of this one should re-define the _real_initialize() and
1063 _real_extract() methods and define a _VALID_URL regexp.
1064 Probably, they should also be added to the list of extractors.
1065 """
1066
1067 _ready = False
1068 _downloader = None
1069
1070 def __init__(self, downloader=None):
1071 """Constructor. Receives an optional downloader."""
1072 self._ready = False
1073 self.set_downloader(downloader)
1074
1075 def suitable(self, url):
1076 """Receives a URL and returns True if suitable for this IE."""
1077 return re.match(self._VALID_URL, url) is not None
1078
1079 def initialize(self):
1080 """Initializes an instance (authentication, etc)."""
1081 if not self._ready:
1082 self._real_initialize()
1083 self._ready = True
1084
1085 def extract(self, url):
1086 """Extracts URL information and returns it in list of dicts."""
1087 self.initialize()
1088 return self._real_extract(url)
1089
1090 def set_downloader(self, downloader):
1091 """Sets the downloader for this IE."""
1092 self._downloader = downloader
1093
1094 def _real_initialize(self):
1095 """Real initialization process. Redefine in subclasses."""
1096 pass
1097
1098 def _real_extract(self, url):
1099 """Real extraction process. Redefine in subclasses."""
1100 pass
1101
1102
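# A minimal custom-extractor sketch following the contract above; the site,
# URL pattern and field values are hypothetical, for illustration only.
class _ExampleIE(InfoExtractor):
    """Illustrative only; never registered with a FileDownloader."""
    _VALID_URL = r'(?:http://)?(?:www\.)?example\.com/video/(\d+)'
    IE_NAME = u'example'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group(1)
        self._downloader.increment_downloads()
        self._downloader.process_info({
            'id': video_id,
            'url': u'http://media.example.com/%s.mp4' % video_id,
            'uploader': u'NA',
            'upload_date': u'NA',
            'title': u'Example video',
            'stitle': u'Example_video',
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        })
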
1103 class YoutubeIE(InfoExtractor):
1104 """Information extractor for youtube.com."""
1105
1106 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1107 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1108 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1109 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1110 _NETRC_MACHINE = 'youtube'
1111 # Listed in order of quality
1112 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1113 _video_extensions = {
1114 '13': '3gp',
1115 '17': 'mp4',
1116 '18': 'mp4',
1117 '22': 'mp4',
1118 '37': 'mp4',
1119 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1120 '43': 'webm',
1121 '44': 'webm',
1122 '45': 'webm',
1123 }
1124 _video_dimensions = {
1125 '5': '240x400',
1126 '6': '???',
1127 '13': '???',
1128 '17': '144x176',
1129 '18': '360x640',
1130 '22': '720x1280',
1131 '34': '360x640',
1132 '35': '480x854',
1133 '37': '1080x1920',
1134 '38': '3072x4096',
1135 '43': '360x640',
1136 '44': '480x854',
1137 '45': '720x1280',
1138 }
1139 IE_NAME = u'youtube'
1140
1141 def report_lang(self):
1142 """Report attempt to set language."""
1143 self._downloader.to_screen(u'[youtube] Setting language')
1144
1145 def report_login(self):
1146 """Report attempt to log in."""
1147 self._downloader.to_screen(u'[youtube] Logging in')
1148
1149 def report_age_confirmation(self):
1150 """Report attempt to confirm age."""
1151 self._downloader.to_screen(u'[youtube] Confirming age')
1152
1153 def report_video_webpage_download(self, video_id):
1154 """Report attempt to download video webpage."""
1155 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1156
1157 def report_video_info_webpage_download(self, video_id):
1158 """Report attempt to download video info webpage."""
1159 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1160
1161 def report_information_extraction(self, video_id):
1162 """Report attempt to extract video information."""
1163 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1164
1165 def report_unavailable_format(self, video_id, format):
1166 """Report extracted video URL."""
1167 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1168
1169 def report_rtmp_download(self):
1170 """Indicate the download will use the RTMP protocol."""
1171 self._downloader.to_screen(u'[youtube] RTMP download detected')
1172
1173 def _print_formats(self, formats):
1174 print 'Available formats:'
1175 for x in formats:
1176 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1177
1178 def _real_initialize(self):
1179 if self._downloader is None:
1180 return
1181
1182 username = None
1183 password = None
1184 downloader_params = self._downloader.params
1185
1186 # Attempt to use provided username and password or .netrc data
1187 if downloader_params.get('username', None) is not None:
1188 username = downloader_params['username']
1189 password = downloader_params['password']
1190 elif downloader_params.get('usenetrc', False):
1191 try:
1192 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1193 if info is not None:
1194 username = info[0]
1195 password = info[2]
1196 else:
1197 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1198 except (IOError, netrc.NetrcParseError), err:
1199 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1200 return
1201
1202 # Set language
1203 request = urllib2.Request(self._LANG_URL)
1204 try:
1205 self.report_lang()
1206 urllib2.urlopen(request).read()
1207 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1208 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1209 return
1210
1211 # No authentication to be performed
1212 if username is None:
1213 return
1214
1215 # Log in
1216 login_form = {
1217 'current_form': 'loginForm',
1218 'next': '/',
1219 'action_login': 'Log In',
1220 'username': username,
1221 'password': password,
1222 }
1223 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1224 try:
1225 self.report_login()
1226 login_results = urllib2.urlopen(request).read()
1227 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1228 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1229 return
1230 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1231 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1232 return
1233
1234 # Confirm age
1235 age_form = {
1236 'next_url': '/',
1237 'action_confirm': 'Confirm',
1238 }
1239 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1240 try:
1241 self.report_age_confirmation()
1242 age_results = urllib2.urlopen(request).read()
1243 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1244 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1245 return
1246
1247 def _real_extract(self, url):
1248 # Extract video id from URL
1249 mobj = re.match(self._VALID_URL, url)
1250 if mobj is None:
1251 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1252 return
1253 video_id = mobj.group(2)
1254
1255 # Get video webpage
1256 self.report_video_webpage_download(video_id)
1257 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1258 try:
1259 video_webpage = urllib2.urlopen(request).read()
1260 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1261 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1262 return
1263
1264 # Attempt to extract SWF player URL
1265 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1266 if mobj is not None:
1267 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1268 else:
1269 player_url = None
1270
1271 # Get video info
1272 self.report_video_info_webpage_download(video_id)
1273 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1274 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1275 % (video_id, el_type))
1276 request = urllib2.Request(video_info_url)
1277 try:
1278 video_info_webpage = urllib2.urlopen(request).read()
1279 video_info = parse_qs(video_info_webpage)
1280 if 'token' in video_info:
1281 break
1282 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1283 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1284 return
1285 if 'token' not in video_info:
1286 if 'reason' in video_info:
1287 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1288 else:
1289 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1290 return
1291
1292 # Start extracting information
1293 self.report_information_extraction(video_id)
1294
1295 # uploader
1296 if 'author' not in video_info:
1297 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1298 return
1299 video_uploader = urllib.unquote_plus(video_info['author'][0])
1300
1301 # title
1302 if 'title' not in video_info:
1303 self._downloader.trouble(u'ERROR: unable to extract video title')
1304 return
1305 video_title = urllib.unquote_plus(video_info['title'][0])
1306 video_title = video_title.decode('utf-8')
1307 video_title = sanitize_title(video_title)
1308
1309 # simplified title
1310 simple_title = _simplify_title(video_title)
1311
1312 # thumbnail image
1313 if 'thumbnail_url' not in video_info:
1314 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1315 video_thumbnail = ''
1316 else: # don't panic if we can't find it
1317 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1318
1319 # upload date
1320 upload_date = u'NA'
1321 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1322 if mobj is not None:
1323 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
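# e.g. u'Nov 23, 2011' has been normalized to u'Nov 23 2011' here, which
# the '%b %d %Y' expression below turns into u'20111123'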
1324 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1325 for expression in format_expressions:
1326 try:
1327 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1328 except:
1329 pass
1330
1331 # description
1332 try:
1333 lxml.etree
1334 except NameError:
1335 video_description = u'No description available.'
1336 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1337 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1338 if mobj is not None:
1339 video_description = mobj.group(1).decode('utf-8')
1340 else:
1341 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1342 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1343 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1344 # TODO use another parser
1345
1346 # token
1347 video_token = urllib.unquote_plus(video_info['token'][0])
1348
1349 # Decide which formats to download
1350 req_format = self._downloader.params.get('format', None)
1351
1352 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1353 self.report_rtmp_download()
1354 video_url_list = [(None, video_info['conn'][0])]
1355 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1356 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1357 url_data = [parse_qs(uds) for uds in url_data_strs]
1358 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1359 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1360
1361 format_limit = self._downloader.params.get('format_limit', None)
1362 if format_limit is not None and format_limit in self._available_formats:
1363 format_list = self._available_formats[self._available_formats.index(format_limit):]
1364 else:
1365 format_list = self._available_formats
1366 existing_formats = [x for x in format_list if x in url_map]
1367 if len(existing_formats) == 0:
1368 self._downloader.trouble(u'ERROR: no known formats available for video')
1369 return
1370 if self._downloader.params.get('listformats', None):
1371 self._print_formats(existing_formats)
1372 return
1373 if req_format is None or req_format == 'best':
1374 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1375 elif req_format == 'worst':
1376 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1377 elif req_format in ('-1', 'all'):
1378 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1379 else:
1380 # Specific formats. We pick the first in a slash-delimited sequence.
1381 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1382 req_formats = req_format.split('/')
1383 video_url_list = None
1384 for rf in req_formats:
1385 if rf in url_map:
1386 video_url_list = [(rf, url_map[rf])]
1387 break
1388 if video_url_list is None:
1389 self._downloader.trouble(u'ERROR: requested format not available')
1390 return
1391 else:
1392 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1393 return
1394
1395 for format_param, video_real_url in video_url_list:
1396 # At this point we have a new video
1397 self._downloader.increment_downloads()
1398
1399 # Extension
1400 video_extension = self._video_extensions.get(format_param, 'flv')
1401
1402 try:
1403 # Process video information
1404 self._downloader.process_info({
1405 'id': video_id.decode('utf-8'),
1406 'url': video_real_url.decode('utf-8'),
1407 'uploader': video_uploader.decode('utf-8'),
1408 'upload_date': upload_date,
1409 'title': video_title,
1410 'stitle': simple_title,
1411 'ext': video_extension.decode('utf-8'),
1412 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1413 'thumbnail': video_thumbnail.decode('utf-8'),
1414 'description': video_description,
1415 'player_url': player_url,
1416 })
1417 except UnavailableVideoError, err:
1418 self._downloader.trouble(u'\nERROR: unable to download video')
1419
1420
1421 class MetacafeIE(InfoExtractor):
1422 """Information Extractor for metacafe.com."""
1423
1424 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1425 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1426 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1427 _youtube_ie = None
1428 IE_NAME = u'metacafe'
1429
1430 def __init__(self, youtube_ie, downloader=None):
1431 InfoExtractor.__init__(self, downloader)
1432 self._youtube_ie = youtube_ie
1433
1434 def report_disclaimer(self):
1435 """Report disclaimer retrieval."""
1436 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1437
1438 def report_age_confirmation(self):
1439 """Report attempt to confirm age."""
1440 self._downloader.to_screen(u'[metacafe] Confirming age')
1441
1442 def report_download_webpage(self, video_id):
1443 """Report webpage download."""
1444 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1445
1446 def report_extraction(self, video_id):
1447 """Report information extraction."""
1448 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1449
1450 def _real_initialize(self):
1451 # Retrieve disclaimer
1452 request = urllib2.Request(self._DISCLAIMER)
1453 try:
1454 self.report_disclaimer()
1455 disclaimer = urllib2.urlopen(request).read()
1456 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1457 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1458 return
1459
1460 # Confirm age
1461 disclaimer_form = {
1462 'filters': '0',
1463 'submit': "Continue - I'm over 18",
1464 }
1465 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1466 try:
1467 self.report_age_confirmation()
1468 disclaimer = urllib2.urlopen(request).read()
1469 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1470 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1471 return
1472
1473 def _real_extract(self, url):
1474 # Extract id and simplified title from URL
1475 mobj = re.match(self._VALID_URL, url)
1476 if mobj is None:
1477 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1478 return
1479
1480 video_id = mobj.group(1)
1481
1482 # Check if video comes from YouTube
1483 mobj2 = re.match(r'^yt-(.*)$', video_id)
1484 if mobj2 is not None:
1485 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1486 return
1487
1488 # At this point we have a new video
1489 self._downloader.increment_downloads()
1490
1491 simple_title = mobj.group(2).decode('utf-8')
1492
1493 # Retrieve video webpage to extract further information
1494 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1495 try:
1496 self.report_download_webpage(video_id)
1497 webpage = urllib2.urlopen(request).read()
1498 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1499 self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
1500 return
1501
1502 # Extract URL, uploader and title from webpage
1503 self.report_extraction(video_id)
1504 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1505 if mobj is not None:
1506 mediaURL = urllib.unquote(mobj.group(1))
1507 video_extension = mediaURL[-3:]
1508
1509 # Extract gdaKey if available
1510 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1511 if mobj is None:
1512 video_url = mediaURL
1513 else:
1514 gdaKey = mobj.group(1)
1515 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1516 else:
1517 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1518 if mobj is None:
1519 self._downloader.trouble(u'ERROR: unable to extract media URL')
1520 return
1521 vardict = parse_qs(mobj.group(1))
1522 if 'mediaData' not in vardict:
1523 self._downloader.trouble(u'ERROR: unable to extract media URL')
1524 return
1525 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1526 if mobj is None:
1527 self._downloader.trouble(u'ERROR: unable to extract media URL')
1528 return
1529 mediaURL = mobj.group(1).replace('\\/', '/')
1530 video_extension = mediaURL[-3:]
1531 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1532
1533 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1534 if mobj is None:
1535 self._downloader.trouble(u'ERROR: unable to extract title')
1536 return
1537 video_title = mobj.group(1).decode('utf-8')
1538 video_title = sanitize_title(video_title)
1539
1540 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1541 if mobj is None:
1542 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1543 return
1544 video_uploader = mobj.group(1)
1545
1546 try:
1547 # Process video information
1548 self._downloader.process_info({
1549 'id': video_id.decode('utf-8'),
1550 'url': video_url.decode('utf-8'),
1551 'uploader': video_uploader.decode('utf-8'),
1552 'upload_date': u'NA',
1553 'title': video_title,
1554 'stitle': simple_title,
1555 'ext': video_extension.decode('utf-8'),
1556 'format': u'NA',
1557 'player_url': None,
1558 })
1559 except UnavailableVideoError:
1560 self._downloader.trouble(u'\nERROR: unable to download video')
1561
1562
1563 class DailymotionIE(InfoExtractor):
1564 """Information Extractor for Dailymotion"""
1565
1566 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1567 IE_NAME = u'dailymotion'
1568
1569 def __init__(self, downloader=None):
1570 InfoExtractor.__init__(self, downloader)
1571
1572 def report_download_webpage(self, video_id):
1573 """Report webpage download."""
1574 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1575
1576 def report_extraction(self, video_id):
1577 """Report information extraction."""
1578 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1579
1580 def _real_extract(self, url):
1581 # Extract id and simplified title from URL
1582 mobj = re.match(self._VALID_URL, url)
1583 if mobj is None:
1584 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1585 return
1586
1587 # At this point we have a new video
1588 self._downloader.increment_downloads()
1589 video_id = mobj.group(1)
1590
1591 simple_title = mobj.group(2).decode('utf-8')
1592 video_extension = 'flv'
1593
1594 # Retrieve video webpage to extract further information
1595 request = urllib2.Request(url)
1596 request.add_header('Cookie', 'family_filter=off')
1597 try:
1598 self.report_download_webpage(video_id)
1599 webpage = urllib2.urlopen(request).read()
1600 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1601 self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
1602 return
1603
1604 # Extract URL, uploader and title from webpage
1605 self.report_extraction(video_id)
1606 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1607 if mobj is None:
1608 self._downloader.trouble(u'ERROR: unable to extract media URL')
1609 return
1610 sequence = urllib.unquote(mobj.group(1))
1611 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1612 if mobj is None:
1613 self._downloader.trouble(u'ERROR: unable to extract media URL')
1614 return
1615 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
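# sdURL arrives with JSON-style escaped slashes (e.g. 'http:\/\/...'), so the backslashes are stripped after unquoting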
1616
1617 # TODO: prepend http://www.dailymotion.com/ if mediaURL turns out to be a relative URL
1618
1619 video_url = mediaURL
1620
1621 mobj = re.search(r'(?im)<title>\s*(.+)\s*-\s*Video\s+Dailymotion</title>', webpage)
1622 if mobj is None:
1623 self._downloader.trouble(u'ERROR: unable to extract title')
1624 return
1625 video_title = mobj.group(1).decode('utf-8')
1626 video_title = sanitize_title(video_title)
1627
1628 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1629 if mobj is None:
1630 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1631 return
1632 video_uploader = mobj.group(1)
1633
1634 try:
1635 # Process video information
1636 self._downloader.process_info({
1637 'id': video_id.decode('utf-8'),
1638 'url': video_url.decode('utf-8'),
1639 'uploader': video_uploader.decode('utf-8'),
1640 'upload_date': u'NA',
1641 'title': video_title,
1642 'stitle': simple_title,
1643 'ext': video_extension.decode('utf-8'),
1644 'format': u'NA',
1645 'player_url': None,
1646 })
1647 except UnavailableVideoError:
1648 self._downloader.trouble(u'\nERROR: unable to download video')
1649
1650
1651 class GoogleIE(InfoExtractor):
1652 """Information extractor for video.google.com."""
1653
1654 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1655 IE_NAME = u'video.google'
1656
1657 def __init__(self, downloader=None):
1658 InfoExtractor.__init__(self, downloader)
1659
1660 def report_download_webpage(self, video_id):
1661 """Report webpage download."""
1662 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1663
1664 def report_extraction(self, video_id):
1665 """Report information extraction."""
1666 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1667
1668 def _real_extract(self, url):
1669 # Extract id from URL
1670 mobj = re.match(self._VALID_URL, url)
1671 if mobj is None:
1672 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1673 return
1674
1675 # At this point we have a new video
1676 self._downloader.increment_downloads()
1677 video_id = mobj.group(1)
1678
1679 video_extension = 'mp4'
1680
1681 # Retrieve video webpage to extract further information
1682 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1683 try:
1684 self.report_download_webpage(video_id)
1685 webpage = urllib2.urlopen(request).read()
1686 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1687 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1688 return
1689
1690 # Extract URL, uploader, and title from webpage
1691 self.report_extraction(video_id)
1692 mobj = re.search(r"download_url:'([^']+)'", webpage)
1693 if mobj is None:
1694 video_extension = 'flv'
1695 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1696 if mobj is None:
1697 self._downloader.trouble(u'ERROR: unable to extract media URL')
1698 return
1699 mediaURL = urllib.unquote(mobj.group(1))
1700 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1701 mediaURL = mediaURL.replace('\\x26', '\x26')
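# In the page source '=' and '&' appear as the JavaScript escapes \x3d and \x26; restore the literal characters so the URL is usable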
1702
1703 video_url = mediaURL
1704
1705 mobj = re.search(r'<title>(.*)</title>', webpage)
1706 if mobj is None:
1707 self._downloader.trouble(u'ERROR: unable to extract title')
1708 return
1709 video_title = mobj.group(1).decode('utf-8')
1710 video_title = sanitize_title(video_title)
1711 simple_title = _simplify_title(video_title)
1712
1713 # Extract video description
1714 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1715 if mobj is None:
1716 self._downloader.trouble(u'ERROR: unable to extract video description')
1717 return
1718 video_description = mobj.group(1).decode('utf-8')
1719 if not video_description:
1720 video_description = 'No description available.'
1721
1722 # Extract video thumbnail
1723 if self._downloader.params.get('forcethumbnail', False):
1724 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1725 try:
1726 webpage = urllib2.urlopen(request).read()
1727 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1728 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1729 return
1730 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1731 if mobj is None:
1732 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1733 return
1734 video_thumbnail = mobj.group(1)
1735 else: # we need something to pass to process_info
1736 video_thumbnail = ''
1737
1738 try:
1739 # Process video information
1740 self._downloader.process_info({
1741 'id': video_id.decode('utf-8'),
1742 'url': video_url.decode('utf-8'),
1743 'uploader': u'NA',
1744 'upload_date': u'NA',
1745 'title': video_title,
1746 'stitle': simple_title,
1747 'ext': video_extension.decode('utf-8'),
1748 'format': u'NA',
1749 'player_url': None,
1750 })
1751 except UnavailableVideoError:
1752 self._downloader.trouble(u'\nERROR: unable to download video')
1753
1754
1755 class PhotobucketIE(InfoExtractor):
1756 """Information extractor for photobucket.com."""
1757
1758 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1759 IE_NAME = u'photobucket'
1760
1761 def __init__(self, downloader=None):
1762 InfoExtractor.__init__(self, downloader)
1763
1764 def report_download_webpage(self, video_id):
1765 """Report webpage download."""
1766 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1767
1768 def report_extraction(self, video_id):
1769 """Report information extraction."""
1770 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1771
1772 def _real_extract(self, url):
1773 # Extract id from URL
1774 mobj = re.match(self._VALID_URL, url)
1775 if mobj is None:
1776 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1777 return
1778
1779 # At this point we have a new video
1780 self._downloader.increment_downloads()
1781 video_id = mobj.group(1)
1782
1783 video_extension = 'flv'
1784
1785 # Retrieve video webpage to extract further information
1786 request = urllib2.Request(url)
1787 try:
1788 self.report_download_webpage(video_id)
1789 webpage = urllib2.urlopen(request).read()
1790 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1791 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1792 return
1793
1794 # Extract URL, uploader, and title from webpage
1795 self.report_extraction(video_id)
1796 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1797 if mobj is None:
1798 self._downloader.trouble(u'ERROR: unable to extract media URL')
1799 return
1800 mediaURL = urllib.unquote(mobj.group(1))
1801
1802 video_url = mediaURL
1803
1804 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1805 if mobj is None:
1806 self._downloader.trouble(u'ERROR: unable to extract title')
1807 return
1808 video_title = mobj.group(1).decode('utf-8')
1809 video_title = sanitize_title(video_title)
1810 simple_title = _simplify_title(video_title)
1811
1812 video_uploader = mobj.group(2).decode('utf-8')
1813
1814 try:
1815 # Process video information
1816 self._downloader.process_info({
1817 'id': video_id.decode('utf-8'),
1818 'url': video_url.decode('utf-8'),
1819 'uploader': video_uploader,
1820 'upload_date': u'NA',
1821 'title': video_title,
1822 'stitle': simple_title,
1823 'ext': video_extension.decode('utf-8'),
1824 'format': u'NA',
1825 'player_url': None,
1826 })
1827 except UnavailableVideoError:
1828 self._downloader.trouble(u'\nERROR: unable to download video')
1829
1830
1831 class YahooIE(InfoExtractor):
1832 """Information extractor for video.yahoo.com."""
1833
1834 # _VALID_URL matches all Yahoo! Video URLs
1835 # _VPAGE_URL matches only the extractable '/watch/' URLs
1836 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1837 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1838 IE_NAME = u'video.yahoo'
1839
1840 def __init__(self, downloader=None):
1841 InfoExtractor.__init__(self, downloader)
1842
1843 def report_download_webpage(self, video_id):
1844 """Report webpage download."""
1845 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1846
1847 def report_extraction(self, video_id):
1848 """Report information extraction."""
1849 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1850
1851 def _real_extract(self, url, new_video=True):
1852 # Extract ID from URL
1853 mobj = re.match(self._VALID_URL, url)
1854 if mobj is None:
1855 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1856 return
1857
1858 # At this point we have a new video
1859 self._downloader.increment_downloads()
1860 video_id = mobj.group(2)
1861 video_extension = 'flv'
1862
1863 # Rewrite valid but non-extractable URLs as
1864 # extractable English language /watch/ URLs
1865 if re.match(self._VPAGE_URL, url) is None:
1866 request = urllib2.Request(url)
1867 try:
1868 webpage = urllib2.urlopen(request).read()
1869 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1870 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1871 return
1872
1873 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1874 if mobj is None:
1875 self._downloader.trouble(u'ERROR: Unable to extract id field')
1876 return
1877 yahoo_id = mobj.group(1)
1878
1879 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1880 if mobj is None:
1881 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1882 return
1883 yahoo_vid = mobj.group(1)
1884
1885 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1886 return self._real_extract(url, new_video=False)
1887
1888 # Retrieve video webpage to extract further information
1889 request = urllib2.Request(url)
1890 try:
1891 self.report_download_webpage(video_id)
1892 webpage = urllib2.urlopen(request).read()
1893 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1894 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1895 return
1896
1897 # Extract uploader and title from webpage
1898 self.report_extraction(video_id)
1899 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1900 if mobj is None:
1901 self._downloader.trouble(u'ERROR: unable to extract video title')
1902 return
1903 video_title = mobj.group(1).decode('utf-8')
1904 simple_title = _simplify_title(video_title)
1905
1906 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1907 if mobj is None:
1908 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1909 return
1910 video_uploader = mobj.group(2).decode('utf-8') # group(1) is 'people' or 'profile'; the nickname is group(2)
1911
1912 # Extract video thumbnail
1913 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1914 if mobj is None:
1915 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1916 return
1917 video_thumbnail = mobj.group(1).decode('utf-8')
1918
1919 # Extract video description
1920 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1921 if mobj is None:
1922 self._downloader.trouble(u'ERROR: unable to extract video description')
1923 return
1924 video_description = mobj.group(1).decode('utf-8')
1925 if not video_description:
1926 video_description = 'No description available.'
1927
1928 # Extract video height and width
1929 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1930 if mobj is None:
1931 self._downloader.trouble(u'ERROR: unable to extract video height')
1932 return
1933 yv_video_height = mobj.group(1)
1934
1935 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1936 if mobj is None:
1937 self._downloader.trouble(u'ERROR: unable to extract video width')
1938 return
1939 yv_video_width = mobj.group(1)
1940
1941 # Retrieve video playlist to extract media URL
1942 # I'm not completely sure what all these options are, but we
1943 # seem to need most of them, otherwise the server sends a 401.
1944 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1945 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1946 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1947 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1948 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1949 try:
1950 self.report_download_webpage(video_id)
1951 webpage = urllib2.urlopen(request).read()
1952 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1953 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1954 return
1955
1956 # Extract media URL from playlist XML
1957 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1958 if mobj is None:
1959 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1960 return
1961 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1962 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
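# The playlist XML splits the location into APP (server part) and FULLPATH; after joining them,
# decode any HTML entities (e.g. '&amp;' -> '&') the XML left in the URL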
1963
1964 try:
1965 # Process video information
1966 self._downloader.process_info({
1967 'id': video_id.decode('utf-8'),
1968 'url': video_url,
1969 'uploader': video_uploader,
1970 'upload_date': u'NA',
1971 'title': video_title,
1972 'stitle': simple_title,
1973 'ext': video_extension.decode('utf-8'),
1974 'thumbnail': video_thumbnail.decode('utf-8'),
1975 'description': video_description,
1977 'player_url': None,
1978 })
1979 except UnavailableVideoError:
1980 self._downloader.trouble(u'\nERROR: unable to download video')
1981
1982
1983 class VimeoIE(InfoExtractor):
1984 """Information extractor for vimeo.com."""
1985
1986 # _VALID_URL matches Vimeo URLs
1987 _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1988 IE_NAME = u'vimeo'
1989
1990 def __init__(self, downloader=None):
1991 InfoExtractor.__init__(self, downloader)
1992
1993 def report_download_webpage(self, video_id):
1994 """Report webpage download."""
1995 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1996
1997 def report_extraction(self, video_id):
1998 """Report information extraction."""
1999 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2000
2001 def _real_extract(self, url, new_video=True):
2002 # Extract ID from URL
2003 mobj = re.match(self._VALID_URL, url)
2004 if mobj is None:
2005 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2006 return
2007
2008 # At this point we have a new video
2009 self._downloader.increment_downloads()
2010 video_id = mobj.group(1)
2011
2012 # Retrieve video webpage to extract further information
2013 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2014 try:
2015 self.report_download_webpage(video_id)
2016 webpage = urllib2.urlopen(request).read()
2017 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2018 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2019 return
2020
2021 # Now we begin extracting as much information as we can from what we
2022 # retrieved. First we extract the information common to all extractors,
2023 # and later we extract those that are Vimeo specific.
2024 self.report_extraction(video_id)
2025
2026 # Extract title
2027 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2028 if mobj is None:
2029 self._downloader.trouble(u'ERROR: unable to extract video title')
2030 return
2031 video_title = mobj.group(1).decode('utf-8')
2032 simple_title = _simplify_title(video_title)
2033
2034 # Extract uploader
2035 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2036 if mobj is None:
2037 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2038 return
2039 video_uploader = mobj.group(1).decode('utf-8')
2040
2041 # Extract video thumbnail
2042 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2043 if mobj is None:
2044 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2045 return
2046 video_thumbnail = mobj.group(1).decode('utf-8')
2047
2048 # # Extract video description
2049 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2050 # if mobj is None:
2051 # self._downloader.trouble(u'ERROR: unable to extract video description')
2052 # return
2053 # video_description = mobj.group(1).decode('utf-8')
2054 # if not video_description: video_description = 'No description available.'
2055 video_description = 'No description available.'
2056
2057 # Vimeo specific: extract request signature
2058 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2059 if mobj is None:
2060 self._downloader.trouble(u'ERROR: unable to extract request signature')
2061 return
2062 sig = mobj.group(1).decode('utf-8')
2063
2064 # Vimeo specific: extract video quality information
2065 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2066 if mobj is None:
2067 self._downloader.trouble(u'ERROR: unable to extract video quality information')
2068 return
2069 quality = mobj.group(1).decode('utf-8')
2070
2071 if int(quality) == 1:
2072 quality = 'hd'
2073 else:
2074 quality = 'sd'
2075
2076 # Vimeo specific: Extract request signature expiration
2077 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2078 if mobj is None:
2079 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2080 return
2081 sig_exp = mobj.group(1).decode('utf-8')
2082
2083 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2084
2085 try:
2086 # Process video information
2087 self._downloader.process_info({
2088 'id': video_id.decode('utf-8'),
2089 'url': video_url,
2090 'uploader': video_uploader,
2091 'upload_date': u'NA',
2092 'title': video_title,
2093 'stitle': simple_title,
2094 'ext': u'mp4',
2095 'thumbnail': video_thumbnail.decode('utf-8'),
2096 'description': video_description,
2099 'player_url': None,
2100 })
2101 except UnavailableVideoError:
2102 self._downloader.trouble(u'\nERROR: unable to download video')
2103
2104
2105 class GenericIE(InfoExtractor):
2106 """Generic last-resort information extractor."""
2107
2108 _VALID_URL = r'.*'
2109 IE_NAME = u'generic'
2110
2111 def __init__(self, downloader=None):
2112 InfoExtractor.__init__(self, downloader)
2113
2114 def report_download_webpage(self, video_id):
2115 """Report webpage download."""
2116 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2117 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2118
2119 def report_extraction(self, video_id):
2120 """Report information extraction."""
2121 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2122
2123 def _real_extract(self, url):
2124 # At this point we have a new video
2125 self._downloader.increment_downloads()
2126
2127 video_id = url.split('/')[-1]
2128 request = urllib2.Request(url)
2129 try:
2130 self.report_download_webpage(video_id)
2131 webpage = urllib2.urlopen(request).read()
2132 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2133 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2134 return
2135 except ValueError, err:
2136 # since this is the last-resort InfoExtractor, if
2137 # this error is thrown, it'll be thrown here
2138 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2139 return
2140
2141 self.report_extraction(video_id)
2142 # Start with something easy: JW Player in SWFObject
2143 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
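# hypothetical example of what this matches: flashvars: 'file=http%3A%2F%2Fcdn.example.com%2Fclip.flv&autostart=true' (the value is unquoted further down)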
2144 if mobj is None:
2145 # Broaden the search a little bit
2146 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2147 if mobj is None:
2148 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2149 return
2150
2151 # It's possible that one of the regexes
2152 # matched, but returned an empty group:
2153 if mobj.group(1) is None:
2154 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2155 return
2156
2157 video_url = urllib.unquote(mobj.group(1))
2158 video_id = os.path.basename(video_url)
2159
2160 # here's a fun little line of code for you:
2161 video_extension = os.path.splitext(video_id)[1][1:]
2162 video_id = os.path.splitext(video_id)[0]
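# e.g. a video_id of 'clip.flv' becomes extension 'flv' and id 'clip'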
2163
2164 # it's tempting to parse this further, but you would
2165 # have to take into account all the variations like
2166 # Video Title - Site Name
2167 # Site Name | Video Title
2168 # Video Title - Tagline | Site Name
2169 # and so on and so forth; it's just not practical
2170 mobj = re.search(r'<title>(.*)</title>', webpage)
2171 if mobj is None:
2172 self._downloader.trouble(u'ERROR: unable to extract title')
2173 return
2174 video_title = mobj.group(1).decode('utf-8')
2175 video_title = sanitize_title(video_title)
2176 simple_title = _simplify_title(video_title)
2177
2178 # video uploader is domain name
2179 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2180 if mobj is None:
2181 self._downloader.trouble(u'ERROR: unable to extract uploader')
2182 return
2183 video_uploader = mobj.group(1).decode('utf-8')
2184
2185 try:
2186 # Process video information
2187 self._downloader.process_info({
2188 'id': video_id.decode('utf-8'),
2189 'url': video_url.decode('utf-8'),
2190 'uploader': video_uploader,
2191 'upload_date': u'NA',
2192 'title': video_title,
2193 'stitle': simple_title,
2194 'ext': video_extension.decode('utf-8'),
2195 'format': u'NA',
2196 'player_url': None,
2197 })
2198 except UnavailableVideoError, err:
2199 self._downloader.trouble(u'\nERROR: unable to download video')
2200
2201
2202 class YoutubeSearchIE(InfoExtractor):
2203 """Information Extractor for YouTube search queries."""
2204 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2205 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2206 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2207 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2208 _youtube_ie = None
2209 _max_youtube_results = 1000
2210 IE_NAME = u'youtube:search'
2211
2212 def __init__(self, youtube_ie, downloader=None):
2213 InfoExtractor.__init__(self, downloader)
2214 self._youtube_ie = youtube_ie
2215
2216 def report_download_page(self, query, pagenum):
2217 """Report attempt to download playlist page with given number."""
2218 query = query.decode(preferredencoding())
2219 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2220
2221 def _real_initialize(self):
2222 self._youtube_ie.initialize()
2223
2224 def _real_extract(self, query):
2225 mobj = re.match(self._VALID_URL, query)
2226 if mobj is None:
2227 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2228 return
2229
2230 prefix, query = query.split(':')
2231 prefix = prefix[8:]
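# 'ytsearch' is eight characters long, so this slice keeps only the optional count: '', digits, or 'all'
# (the gvsearch/yvsearch extractors below use the same trick)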
2232 query = query.encode('utf-8')
2233 if prefix == '':
2234 self._download_n_results(query, 1)
2235 return
2236 elif prefix == 'all':
2237 self._download_n_results(query, self._max_youtube_results)
2238 return
2239 else:
2240 try:
2241 n = long(prefix)
2242 if n <= 0:
2243 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2244 return
2245 elif n > self._max_youtube_results:
2246 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2247 n = self._max_youtube_results
2248 self._download_n_results(query, n)
2249 return
2250 except ValueError: # parsing prefix as integer fails
2251 self._download_n_results(query, 1)
2252 return
2253
2254 def _download_n_results(self, query, n):
2255 """Downloads a specified number of results for a query"""
2256
2257 video_ids = []
2258 already_seen = set()
2259 pagenum = 1
2260
2261 while True:
2262 self.report_download_page(query, pagenum)
2263 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2264 request = urllib2.Request(result_url)
2265 try:
2266 page = urllib2.urlopen(request).read()
2267 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2268 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2269 return
2270
2271 # Extract video identifiers
2272 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2273 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
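# The match looks like href="/watch?v=ID"; splitting on '=' yields ['href', '"/watch?v', 'ID"'], and [:-1] drops the trailing quote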
2274 if video_id not in already_seen:
2275 video_ids.append(video_id)
2276 already_seen.add(video_id)
2277 if len(video_ids) == n:
2278 # Specified n videos reached
2279 for id in video_ids:
2280 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2281 return
2282
2283 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2284 for id in video_ids:
2285 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2286 return
2287
2288 pagenum = pagenum + 1
2289
2290
2291 class GoogleSearchIE(InfoExtractor):
2292 """Information Extractor for Google Video search queries."""
2293 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2294 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2295 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2296 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2297 _google_ie = None
2298 _max_google_results = 1000
2299 IE_NAME = u'video.google:search'
2300
2301 def __init__(self, google_ie, downloader=None):
2302 InfoExtractor.__init__(self, downloader)
2303 self._google_ie = google_ie
2304
2305 def report_download_page(self, query, pagenum):
2306 """Report attempt to download playlist page with given number."""
2307 query = query.decode(preferredencoding())
2308 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2309
2310 def _real_initialize(self):
2311 self._google_ie.initialize()
2312
2313 def _real_extract(self, query):
2314 mobj = re.match(self._VALID_URL, query)
2315 if mobj is None:
2316 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2317 return
2318
2319 prefix, query = query.split(':')
2320 prefix = prefix[8:]
2321 query = query.encode('utf-8')
2322 if prefix == '':
2323 self._download_n_results(query, 1)
2324 return
2325 elif prefix == 'all':
2326 self._download_n_results(query, self._max_google_results)
2327 return
2328 else:
2329 try:
2330 n = long(prefix)
2331 if n <= 0:
2332 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2333 return
2334 elif n > self._max_google_results:
2335 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2336 n = self._max_google_results
2337 self._download_n_results(query, n)
2338 return
2339 except ValueError: # parsing prefix as integer fails
2340 self._download_n_results(query, 1)
2341 return
2342
2343 def _download_n_results(self, query, n):
2344 """Downloads a specified number of results for a query"""
2345
2346 video_ids = []
2347 already_seen = set()
2348 pagenum = 1
2349
2350 while True:
2351 self.report_download_page(query, pagenum)
2352 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2353 request = urllib2.Request(result_url)
2354 try:
2355 page = urllib2.urlopen(request).read()
2356 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2357 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2358 return
2359
2360 # Extract video identifiers
2361 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2362 video_id = mobj.group(1)
2363 if video_id not in already_seen:
2364 video_ids.append(video_id)
2365 already_seen.add(video_id)
2366 if len(video_ids) == n:
2367 # Specified n videos reached
2368 for id in video_ids:
2369 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2370 return
2371
2372 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2373 for id in video_ids:
2374 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2375 return
2376
2377 pagenum = pagenum + 1
2378
2379
2380 class YahooSearchIE(InfoExtractor):
2381 """Information Extractor for Yahoo! Video search queries."""
2382 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2383 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2384 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2385 _MORE_PAGES_INDICATOR = r'\s*Next'
2386 _yahoo_ie = None
2387 _max_yahoo_results = 1000
2388 IE_NAME = u'video.yahoo:search'
2389
2390 def __init__(self, yahoo_ie, downloader=None):
2391 InfoExtractor.__init__(self, downloader)
2392 self._yahoo_ie = yahoo_ie
2393
2394 def report_download_page(self, query, pagenum):
2395 """Report attempt to download playlist page with given number."""
2396 query = query.decode(preferredencoding())
2397 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2398
2399 def _real_initialize(self):
2400 self._yahoo_ie.initialize()
2401
2402 def _real_extract(self, query):
2403 mobj = re.match(self._VALID_URL, query)
2404 if mobj is None:
2405 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2406 return
2407
2408 prefix, query = query.split(':')
2409 prefix = prefix[8:]
2410 query = query.encode('utf-8')
2411 if prefix == '':
2412 self._download_n_results(query, 1)
2413 return
2414 elif prefix == 'all':
2415 self._download_n_results(query, self._max_yahoo_results)
2416 return
2417 else:
2418 try:
2419 n = long(prefix)
2420 if n <= 0:
2421 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2422 return
2423 elif n > self._max_yahoo_results:
2424 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2425 n = self._max_yahoo_results
2426 self._download_n_results(query, n)
2427 return
2428 except ValueError: # parsing prefix as integer fails
2429 self._download_n_results(query, 1)
2430 return
2431
2432 def _download_n_results(self, query, n):
2433 """Downloads a specified number of results for a query"""
2434
2435 video_ids = []
2436 already_seen = set()
2437 pagenum = 1
2438
2439 while True:
2440 self.report_download_page(query, pagenum)
2441 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2442 request = urllib2.Request(result_url)
2443 try:
2444 page = urllib2.urlopen(request).read()
2445 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2446 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2447 return
2448
2449 # Extract video identifiers
2450 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2451 video_id = mobj.group(1)
2452 if video_id not in already_seen:
2453 video_ids.append(video_id)
2454 already_seen.add(video_id)
2455 if len(video_ids) == n:
2456 # Specified n videos reached
2457 for id in video_ids:
2458 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2459 return
2460
2461 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2462 for id in video_ids:
2463 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2464 return
2465
2466 pagenum = pagenum + 1
2467
2468
2469 class YoutubePlaylistIE(InfoExtractor):
2470 """Information Extractor for YouTube playlists."""
2471
2472 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
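# Rough reading of this pattern: group(1) is the list type ('p', 'a' or 'list'), group(2) the playlist id,
# and group(3) an optional trailing video id used for the single-video case below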
2473 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2474 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2475 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2476 _youtube_ie = None
2477 IE_NAME = u'youtube:playlist'
2478
2479 def __init__(self, youtube_ie, downloader=None):
2480 InfoExtractor.__init__(self, downloader)
2481 self._youtube_ie = youtube_ie
2482
2483 def report_download_page(self, playlist_id, pagenum):
2484 """Report attempt to download playlist page with given number."""
2485 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2486
2487 def _real_initialize(self):
2488 self._youtube_ie.initialize()
2489
2490 def _real_extract(self, url):
2491 # Extract playlist id
2492 mobj = re.match(self._VALID_URL, url)
2493 if mobj is None:
2494 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2495 return
2496
2497 # Single video case
2498 if mobj.group(3) is not None:
2499 self._youtube_ie.extract(mobj.group(3))
2500 return
2501
2502 # Download playlist pages
2503 # the prefix is 'p' by default for playlists, but other types ('a' for artist pages) need extra care
2504 playlist_prefix = mobj.group(1)
2505 if playlist_prefix == 'a':
2506 playlist_access = 'artist'
2507 else:
2508 playlist_prefix = 'p'
2509 playlist_access = 'view_play_list'
2510 playlist_id = mobj.group(2)
2511 video_ids = []
2512 pagenum = 1
2513
2514 while True:
2515 self.report_download_page(playlist_id, pagenum)
2516 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2517 request = urllib2.Request(url)
2518 try:
2519 page = urllib2.urlopen(request).read()
2520 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2521 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2522 return
2523
2524 # Extract video identifiers
2525 ids_in_page = []
2526 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2527 if mobj.group(1) not in ids_in_page:
2528 ids_in_page.append(mobj.group(1))
2529 video_ids.extend(ids_in_page)
2530
2531 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2532 break
2533 pagenum = pagenum + 1
2534
2535 playliststart = self._downloader.params.get('playliststart', 1) - 1
2536 playlistend = self._downloader.params.get('playlistend', -1)
2537 # playlistend defaults to -1 ('no end'); slicing with -1 would silently drop the last video
if playlistend == -1:
video_ids = video_ids[playliststart:]
else:
video_ids = video_ids[playliststart:playlistend]
2538
2539 for id in video_ids:
2540 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2541 return
2542
2543
2544 class YoutubeUserIE(InfoExtractor):
2545 """Information Extractor for YouTube users."""
2546
2547 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2548 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2549 _GDATA_PAGE_SIZE = 50
2550 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2551 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2552 _youtube_ie = None
2553 IE_NAME = u'youtube:user'
2554
2555 def __init__(self, youtube_ie, downloader=None):
2556 InfoExtractor.__init__(self, downloader)
2557 self._youtube_ie = youtube_ie
2558
2559 def report_download_page(self, username, start_index):
2560 """Report attempt to download user page."""
2561 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2562 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2563
2564 def _real_initialize(self):
2565 self._youtube_ie.initialize()
2566
2567 def _real_extract(self, url):
2568 # Extract username
2569 mobj = re.match(self._VALID_URL, url)
2570 if mobj is None:
2571 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2572 return
2573
2574 username = mobj.group(1)
2575
2576 # Download video ids using the YouTube Data API. Result size per
2577 # query is limited (currently to 50 videos), so we need to query
2578 # page by page until no more video ids are returned, which means
2579 # we have all of them.
2580
2581 video_ids = []
2582 pagenum = 0
2583
2584 while True:
2585 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
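# GData start-index is 1-based: page 0 requests entries 1..50, page 1 requests 51..100, and so on (_GDATA_PAGE_SIZE is 50)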
2586 self.report_download_page(username, start_index)
2587
2588 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2589
2590 try:
2591 page = urllib2.urlopen(request).read()
2592 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2593 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2594 return
2595
2596 # Extract video identifiers
2597 ids_in_page = []
2598
2599 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2600 if mobj.group(1) not in ids_in_page:
2601 ids_in_page.append(mobj.group(1))
2602
2603 video_ids.extend(ids_in_page)
2604
2605 # A little optimization: if the current page is not
2606 # "full", i.e. contains fewer than PAGE_SIZE video ids,
2607 # we can assume that this page is the last one - there
2608 # are no more ids on further pages, so there is no need
2609 # to query again.
2610
2611 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2612 break
2613
2614 pagenum += 1
2615
2616 all_ids_count = len(video_ids)
2617 playliststart = self._downloader.params.get('playliststart', 1) - 1
2618 playlistend = self._downloader.params.get('playlistend', -1)
2619
2620 if playlistend == -1:
2621 video_ids = video_ids[playliststart:]
2622 else:
2623 video_ids = video_ids[playliststart:playlistend]
2624
2625 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2626 (username, all_ids_count, len(video_ids)))
2627
2628 for video_id in video_ids:
2629 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2630
2631
2632 class DepositFilesIE(InfoExtractor):
2633 """Information extractor for depositfiles.com"""
2634
2635 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
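# '..' matches the two-letter locale path segment (e.g. '/en/'); (?#locale) is just a regex comment noting that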
2636 IE_NAME = u'DepositFiles'
2637
2638 def __init__(self, downloader=None):
2639 InfoExtractor.__init__(self, downloader)
2640
2641 def report_download_webpage(self, file_id):
2642 """Report webpage download."""
2643 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2644
2645 def report_extraction(self, file_id):
2646 """Report information extraction."""
2647 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2648
2649 def _real_extract(self, url):
2650 # At this point we have a new file
2651 self._downloader.increment_downloads()
2652
2653 file_id = url.split('/')[-1]
2654 # Rebuild url in english locale
2655 url = 'http://depositfiles.com/en/files/' + file_id
2656
2657 # Retrieve file webpage with 'Free download' button pressed
2658 free_download_indication = { 'gateway_result' : '1' }
2659 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2660 try:
2661 self.report_download_webpage(file_id)
2662 webpage = urllib2.urlopen(request).read()
2663 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2664 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2665 return
2666
2667 # Search for the real file URL
2668 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2669 if (mobj is None) or (mobj.group(1) is None):
2670 # Try to figure out reason of the error.
2671 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2672 if (mobj is not None) and (mobj.group(1) is not None):
2673 restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
2674 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2675 else:
2676 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2677 return
2678
2679 file_url = mobj.group(1)
2680 file_extension = os.path.splitext(file_url)[1][1:]
2681
2682 # Search for file title
2683 mobj = re.search(r'<b title="(.*?)">', webpage)
2684 if mobj is None:
2685 self._downloader.trouble(u'ERROR: unable to extract title')
2686 return
2687 file_title = mobj.group(1).decode('utf-8')
2688
2689 try:
2690 # Process file information
2691 self._downloader.process_info({
2692 'id': file_id.decode('utf-8'),
2693 'url': file_url.decode('utf-8'),
2694 'uploader': u'NA',
2695 'upload_date': u'NA',
2696 'title': file_title,
2697 'stitle': file_title,
2698 'ext': file_extension.decode('utf-8'),
2699 'format': u'NA',
2700 'player_url': None,
2701 })
2702 except UnavailableVideoError, err:
2703 self._downloader.trouble(u'ERROR: unable to download file')
2704
2705
2706 class FacebookIE(InfoExtractor):
2707 """Information Extractor for Facebook"""
2708
2709 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2710 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2711 _NETRC_MACHINE = 'facebook'
2712 _available_formats = ['video', 'highqual', 'lowqual']
2713 _video_extensions = {
2714 'video': 'mp4',
2715 'highqual': 'mp4',
2716 'lowqual': 'mp4',
2717 }
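# _available_formats is ordered from best to worst quality; the format selection in _real_extract relies on this ordering (existing_formats[0] is the best)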
2718 IE_NAME = u'facebook'
2719
2720 def __init__(self, downloader=None):
2721 InfoExtractor.__init__(self, downloader)
2722
2723 def _reporter(self, message):
2724 """Add header and report message."""
2725 self._downloader.to_screen(u'[facebook] %s' % message)
2726
2727 def report_login(self):
2728 """Report attempt to log in."""
2729 self._reporter(u'Logging in')
2730
2731 def report_video_webpage_download(self, video_id):
2732 """Report attempt to download video webpage."""
2733 self._reporter(u'%s: Downloading video webpage' % video_id)
2734
2735 def report_information_extraction(self, video_id):
2736 """Report attempt to extract video information."""
2737 self._reporter(u'%s: Extracting video information' % video_id)
2738
2739 def _parse_page(self, video_webpage):
2740 """Extract video information from page"""
2741 # General data
2742 data = {'title': r'\("video_title", "(.*?)"\)',
2743 'description': r'<div class="datawrap">(.*?)</div>',
2744 'owner': r'\("video_owner_name", "(.*?)"\)',
2745 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2746 }
2747 video_info = {}
2748 for piece in data.keys():
2749 mobj = re.search(data[piece], video_webpage)
2750 if mobj is not None:
2751 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2752
2753 # Video urls
2754 video_urls = {}
2755 for fmt in self._available_formats:
2756 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2757 if mobj is not None:
2758 # The URL sits in a JavaScript segment as an escaped Unicode string
2759 # within the (generally UTF-8) page
2760 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2761 video_info['video_urls'] = video_urls
2762
2763 return video_info
2764
2765 def _real_initialize(self):
2766 if self._downloader is None:
2767 return
2768
2769 useremail = None
2770 password = None
2771 downloader_params = self._downloader.params
2772
2773 # Attempt to use provided username and password or .netrc data
2774 if downloader_params.get('username', None) is not None:
2775 useremail = downloader_params['username']
2776 password = downloader_params['password']
2777 elif downloader_params.get('usenetrc', False):
2778 try:
2779 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2780 if info is not None:
2781 useremail = info[0]
2782 password = info[2]
2783 else:
2784 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2785 except (IOError, netrc.NetrcParseError), err:
2786 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2787 return
2788
2789 if useremail is None:
2790 return
2791
2792 # Log in
2793 login_form = {
2794 'email': useremail,
2795 'pass': password,
2796 'login': 'Log+In'
2797 }
2798 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2799 try:
2800 self.report_login()
2801 login_results = urllib2.urlopen(request).read()
2802 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2803 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
2804 return
2805 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2806 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2807 return
2808
2809 def _real_extract(self, url):
2810 mobj = re.match(self._VALID_URL, url)
2811 if mobj is None:
2812 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2813 return
2814 video_id = mobj.group('ID')
2815
2816 # Get video webpage
2817 self.report_video_webpage_download(video_id)
2818 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2819 try:
2820 page = urllib2.urlopen(request)
2821 video_webpage = page.read()
2822 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2823 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2824 return
2825
2826 # Start extracting information
2827 self.report_information_extraction(video_id)
2828
2829 # Extract information
2830 video_info = self._parse_page(video_webpage)
2831
2832 # uploader
2833 if 'owner' not in video_info:
2834 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2835 return
2836 video_uploader = video_info['owner']
2837
2838 # title
2839 if 'title' not in video_info:
2840 self._downloader.trouble(u'ERROR: unable to extract video title')
2841 return
2842 video_title = video_info['title']
2843 video_title = video_title.decode('utf-8')
2844 video_title = sanitize_title(video_title)
2845
2846 simple_title = _simplify_title(video_title)
2847
2848 # thumbnail image
2849 if 'thumbnail' not in video_info:
2850 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2851 video_thumbnail = ''
2852 else:
2853 video_thumbnail = video_info['thumbnail']
2854
2855 # upload date
2856 upload_date = u'NA'
2857 if 'upload_date' in video_info:
2858 upload_time = video_info['upload_date']
2859 timetuple = email.utils.parsedate_tz(upload_time)
2860 if timetuple is not None:
2861 try:
2862 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2863 except:
2864 pass
2865
2866 # description
2867 video_description = video_info.get('description', 'No description available.')
2868
2869 url_map = video_info['video_urls']
2870 if len(url_map.keys()) > 0:
2871 # Decide which formats to download
2872 req_format = self._downloader.params.get('format', None)
2873 format_limit = self._downloader.params.get('format_limit', None)
2874
2875 if format_limit is not None and format_limit in self._available_formats:
2876 format_list = self._available_formats[self._available_formats.index(format_limit):]
2877 else:
2878 format_list = self._available_formats
2879 existing_formats = [x for x in format_list if x in url_map]
2880 if len(existing_formats) == 0:
2881 self._downloader.trouble(u'ERROR: no known formats available for video')
2882 return
2883 if req_format is None:
2884 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2885 elif req_format == 'worst':
2886 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
2887 elif req_format == '-1':
2888 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2889 else:
2890 # Specific format
2891 if req_format not in url_map:
2892 self._downloader.trouble(u'ERROR: requested format not available')
2893 return
2894 video_url_list = [(req_format, url_map[req_format])] # Specific format
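# To summarize the ladder above: req_format None -> best available, 'worst' -> last (worst) entry, '-1' -> every format, anything else -> that exact format key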
2895
2896 for format_param, video_real_url in video_url_list:
2897
2898 # At this point we have a new video
2899 self._downloader.increment_downloads()
2900
2901 # Extension
2902 video_extension = self._video_extensions.get(format_param, 'mp4')
2903
2904 try:
2905 # Process video information
2906 self._downloader.process_info({
2907 'id': video_id.decode('utf-8'),
2908 'url': video_real_url.decode('utf-8'),
2909 'uploader': video_uploader.decode('utf-8'),
2910 'upload_date': upload_date,
2911 'title': video_title,
2912 'stitle': simple_title,
2913 'ext': video_extension.decode('utf-8'),
2914 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2915 'thumbnail': video_thumbnail.decode('utf-8'),
2916 'description': video_description.decode('utf-8'),
2917 'player_url': None,
2918 })
2919 except UnavailableVideoError, err:
2920 self._downloader.trouble(u'\nERROR: unable to download video')
2921
2922 class BlipTVIE(InfoExtractor):
2923 """Information extractor for blip.tv"""
2924
2925 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2926 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2927 IE_NAME = u'blip.tv'
2928
2929 def report_extraction(self, file_id):
2930 """Report information extraction."""
2931 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2932
2933 def report_direct_download(self, title):
2934 """Report information extraction."""
2935 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2936
2937 def _real_extract(self, url):
2938 mobj = re.match(self._VALID_URL, url)
2939 if mobj is None:
2940 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2941 return
2942
2943 if '?' in url:
2944 cchar = '&'
2945 else:
2946 cchar = '?'
2947 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
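# skin=json&version=2&no_wrap=1 apparently asks blip.tv to return the post metadata as JSON instead of an HTML page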
2948 request = urllib2.Request(json_url)
2949 self.report_extraction(mobj.group(1))
2950 info = None
2951 try:
2952 urlh = urllib2.urlopen(request)
2953 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2954 basename = url.split('/')[-1]
2955 title, ext = os.path.splitext(basename)
2956 title = title.decode('UTF-8')
2957 ext = ext.replace('.', '')
2958 self.report_direct_download(title)
2959 info = {
2960 'id': title,
2961 'url': url,
2962 'title': title,
2963 'stitle': _simplify_title(title),
2964 'ext': ext,
2965 'urlhandle': urlh
2966 }
2967 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2968 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2969 return
2970 if info is None: # Regular URL
2971 try:
2972 json_code = urlh.read()
2973 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2974 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2975 return
2976
2977 try:
2978 json_data = json.loads(json_code)
2979 if 'Post' in json_data:
2980 data = json_data['Post']
2981 else:
2982 data = json_data
2983
2984 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
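# e.g. (illustrative) a datestamp of '06-25-10 07:30PM' becomes '20100625'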
2985 video_url = data['media']['url']
2986 umobj = re.match(self._URL_EXT, video_url)
2987 if umobj is None:
2988 raise ValueError('Can not determine filename extension')
2989 ext = umobj.group(1)
2990
2991 info = {
2992 'id': data['item_id'],
2993 'url': video_url,
2994 'uploader': data['display_name'],
2995 'upload_date': upload_date,
2996 'title': data['title'],
2997 'stitle': _simplify_title(data['title']),
2998 'ext': ext,
2999 'format': data['media']['mimeType'],
3000 'thumbnail': data['thumbnailUrl'],
3001 'description': data['description'],
3002 'player_url': data['embedUrl']
3003 }
3004 except (ValueError, KeyError), err:
3005 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3006 return
3007
3008 self._downloader.increment_downloads()
3009
3010 try:
3011 self._downloader.process_info(info)
3012 except UnavailableVideoError, err:
3013 self._downloader.trouble(u'\nERROR: unable to download video')
3014
3015
3016 class MyVideoIE(InfoExtractor):
3017 """Information Extractor for myvideo.de."""
3018
3019 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3020 IE_NAME = u'myvideo'
3021
3022 def __init__(self, downloader=None):
3023 InfoExtractor.__init__(self, downloader)
3024
3025 def report_download_webpage(self, video_id):
3026 """Report webpage download."""
3027 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3028
3029 def report_extraction(self, video_id):
3030 """Report information extraction."""
3031 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3032
3033 def _real_extract(self,url):
3034 mobj = re.match(self._VALID_URL, url)
3035 if mobj is None:
3036 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3037 return
3038
3039 video_id = mobj.group(1)
3040
3041 # Get video webpage
3042 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3043 try:
3044 self.report_download_webpage(video_id)
3045 webpage = urllib2.urlopen(request).read()
3046 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3047 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3048 return
3049
3050 self.report_extraction(video_id)
3051 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3052 webpage)
3053 if mobj is None:
3054 self._downloader.trouble(u'ERROR: unable to extract media URL')
3055 return
3056 video_url = mobj.group(1) + ('/%s.flv' % video_id)
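# group(1) is the movie's base path taken from the thumbnail link (hypothetically http://is1.myvideo.de/de/movie17/f99);
# the downloadable flv is assumed to live at <base>/<video_id>.flv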
3057
3058 mobj = re.search('<title>([^<]+)</title>', webpage)
3059 if mobj is None:
3060 self._downloader.trouble(u'ERROR: unable to extract title')
3061 return
3062
3063 video_title = mobj.group(1)
3064 video_title = sanitize_title(video_title)
3065
3066 simple_title = _simplify_title(video_title)
3067
3068 try:
3069 self._downloader.process_info({
3070 'id': video_id,
3071 'url': video_url,
3072 'uploader': u'NA',
3073 'upload_date': u'NA',
3074 'title': video_title,
3075 'stitle': simple_title,
3076 'ext': u'flv',
3077 'format': u'NA',
3078 'player_url': None,
3079 })
3080 except UnavailableVideoError:
3081 self._downloader.trouble(u'\nERROR: Unable to download video')
3082
3083 class ComedyCentralIE(InfoExtractor):
3084 """Information extractor for The Daily Show and Colbert Report """
3085
3086 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3087 IE_NAME = u'comedycentral'
3088
3089 def report_extraction(self, episode_id):
3090 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3091
3092 def report_config_download(self, episode_id):
3093 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3094
3095 def report_index_download(self, episode_id):
3096 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3097
3098 def report_player_url(self, episode_id):
3099 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3100
3101 def _real_extract(self, url):
3102 mobj = re.match(self._VALID_URL, url)
3103 if mobj is None:
3104 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3105 return
3106
3107 if mobj.group('shortname'):
3108 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3109 url = u'http://www.thedailyshow.com/full-episodes/'
3110 else:
3111 url = u'http://www.colbertnation.com/full-episodes/'
3112 mobj = re.match(self._VALID_URL, url)
3113 assert mobj is not None
3114
3115 dlNewest = not mobj.group('episode')
3116 if dlNewest:
3117 epTitle = mobj.group('showname')
3118 else:
3119 epTitle = mobj.group('episode')
3120
3121 req = urllib2.Request(url)
3122 self.report_extraction(epTitle)
3123 try:
3124 htmlHandle = urllib2.urlopen(req)
3125 html = htmlHandle.read()
3126 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3127 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3128 return
3129 if dlNewest:
3130 url = htmlHandle.geturl()
3131 mobj = re.match(self._VALID_URL, url)
3132 if mobj is None:
3133 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3134 return
3135 if mobj.group('episode') == '':
3136 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3137 return
3138 epTitle = mobj.group('episode')
3139
3140 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3141 if len(mMovieParams) == 0:
3142 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3143 return
3144
3145 playerUrl_raw = mMovieParams[0][0]
3146 self.report_player_url(epTitle)
3147 try:
3148 urlHandle = urllib2.urlopen(playerUrl_raw)
3149 playerUrl = urlHandle.geturl()
3150 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3151 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3152 return
3153
3154 uri = mMovieParams[0][1]
3155 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3156 self.report_index_download(epTitle)
3157 try:
3158 indexXml = urllib2.urlopen(indexUrl).read()
3159 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3160 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3161 return
3162
3163 idoc = xml.etree.ElementTree.fromstring(indexXml)
3164 itemEls = idoc.findall('.//item')
3165 for itemEl in itemEls:
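# a guid such as mgid:cms:video:thedailyshow.com:123456 (form assumed
# from the splitting below) yields shortMediaId '123456' and showId
# 'thedailyshow'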
3166 mediaId = itemEl.findall('./guid')[0].text
3167 shortMediaId = mediaId.split(':')[-1]
3168 showId = mediaId.split(':')[-2].replace('.com', '')
3169 officialTitle = itemEl.findall('./title')[0].text
3170 officialDate = itemEl.findall('./pubDate')[0].text
3171
3172 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3173 urllib.urlencode({'uri': mediaId}))
3174 configReq = urllib2.Request(configUrl)
3175 self.report_config_download(epTitle)
3176 try:
3177 configXml = urllib2.urlopen(configReq).read()
3178 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3179 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3180 return
3181
3182 cdoc = xml.etree.ElementTree.fromstring(configXml)
3183 turls = []
3184 for rendition in cdoc.findall('.//rendition'):
3185 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3186 turls.append(finfo)
3187
3188 if len(turls) == 0:
3189 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3190 continue
3191
3192 # For now, just pick the highest bitrate
3193 format, video_url = turls[-1]
3194
3195 self._downloader.increment_downloads()
3196
3197 effTitle = showId + u'-' + epTitle
3198 info = {
3199 'id': shortMediaId,
3200 'url': video_url,
3201 'uploader': showId,
3202 'upload_date': officialDate,
3203 'title': effTitle,
3204 'stitle': _simplify_title(effTitle),
3205 'ext': 'mp4',
3206 'format': format,
3207 'thumbnail': None,
3208 'description': officialTitle,
3209 'player_url': playerUrl
3210 }
3211
3212 try:
3213 self._downloader.process_info(info)
3214 except UnavailableVideoError, err:
3215 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3216 continue
3217
3218
3219 class EscapistIE(InfoExtractor):
3220 """Information extractor for The Escapist """
3221
3222 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3223 IE_NAME = u'escapist'
3224
3225 def report_extraction(self, showName):
3226 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3227
3228 def report_config_download(self, showName):
3229 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3230
3231 def _real_extract(self, url):
3232 htmlParser = HTMLParser.HTMLParser()
3233
3234 mobj = re.match(self._VALID_URL, url)
3235 if mobj is None:
3236 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3237 return
3238 showName = mobj.group('showname')
3239 videoId = mobj.group('episode')
3240
3241 self.report_extraction(showName)
3242 try:
3243 webPage = urllib2.urlopen(url).read()
3244 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3245 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3246 return
3247
3248 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3249 description = htmlParser.unescape(descMatch.group(1))
3250 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3251 imgUrl = htmlParser.unescape(imgMatch.group(1))
3252 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3253 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3254 configUrlMatch = re.search('config=(.*)$', playerUrl)
3255 configUrl = urllib2.unquote(configUrlMatch.group(1))
3256
3257 self.report_config_download(showName)
3258 try:
3259 configJSON = urllib2.urlopen(configUrl).read()
3260 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3261 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3262 return
3263
3264 # Technically, it's JavaScript, not JSON: rewrite the single quotes so the JSON parser accepts it
3265 configJSON = configJSON.replace("'", '"')
3266
3267 try:
3268 config = json.loads(configJSON)
3269 except ValueError, err:
3270 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3271 return
3272
3273 playlist = config['playlist']
3274 videoUrl = playlist[1]['url']
3275
3276 self._downloader.increment_downloads()
3277 info = {
3278 'id': videoId,
3279 'url': videoUrl,
3280 'uploader': showName,
3281 'upload_date': None,
3282 'title': showName,
3283 'stitle': _simplify_title(showName),
3284 'ext': 'flv',
3285 'format': 'flv',
3286 'thumbnail': imgUrl,
3287 'description': description,
3288 'player_url': playerUrl,
3289 }
3290
3291 try:
3292 self._downloader.process_info(info)
3293 except UnavailableVideoError, err:
3294 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3295
3296
3297 class CollegeHumorIE(InfoExtractor):
3298 """Information extractor for collegehumor.com"""
3299
3300 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3301 IE_NAME = u'collegehumor'
3302
3303 def report_webpage(self, video_id):
3304 """Report information extraction."""
3305 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3306
3307 def report_extraction(self, video_id):
3308 """Report information extraction."""
3309 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3310
3311 def _real_extract(self, url):
3312 htmlParser = HTMLParser.HTMLParser()
3313
3314 mobj = re.match(self._VALID_URL, url)
3315 if mobj is None:
3316 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3317 return
3318 video_id = mobj.group('videoid')
3319
3320 self.report_webpage(video_id)
3321 request = urllib2.Request(url)
3322 try:
3323 webpage = urllib2.urlopen(request).read()
3324 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3325 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3326 return
3327
3328 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3329 if m is None:
3330 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3331 return
3332 internal_video_id = m.group('internalvideoid')
3333
3334 info = {
3335 'id': video_id,
3336 'internal_id': internal_video_id,
3337 }
3338
3339 self.report_extraction(video_id)
3340 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3341 try:
3342 metaXml = urllib2.urlopen(xmlUrl).read()
3343 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3344 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3345 return
3346
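# The metadata XML is assumed (from the fields read below) to look like:
#   <video><description>...</description><caption>...</caption>
#     <file>...</file><thumbnail>...</thumbnail></video>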
3347 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3348 try:
3349 videoNode = mdoc.findall('./video')[0]
3350 info['description'] = videoNode.findall('./description')[0].text
3351 info['title'] = videoNode.findall('./caption')[0].text
3352 info['stitle'] = _simplify_title(info['title'])
3353 info['url'] = videoNode.findall('./file')[0].text
3354 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3355 info['ext'] = info['url'].rpartition('.')[2]
3356 info['format'] = info['ext']
3357 except IndexError:
3358 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3359 return
3360
3361 self._downloader.increment_downloads()
3362
3363 try:
3364 self._downloader.process_info(info)
3365 except UnavailableVideoError, err:
3366 self._downloader.trouble(u'\nERROR: unable to download video')
3367
3368
3369 class XVideosIE(InfoExtractor):
3370 """Information extractor for xvideos.com"""
3371
3372 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3373 IE_NAME = u'xvideos'
3374
3375 def report_webpage(self, video_id):
3376 """Report information extraction."""
3377 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3378
3379 def report_extraction(self, video_id):
3380 """Report information extraction."""
3381 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3382
3383 def _real_extract(self, url):
3384 htmlParser = HTMLParser.HTMLParser()
3385
3386 mobj = re.match(self._VALID_URL, url)
3387 if mobj is None:
3388 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3389 return
3390 video_id = mobj.group(1).decode('utf-8')
3391
3392 self.report_webpage(video_id)
3393
3394 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3395 try:
3396 webpage = urllib2.urlopen(request).read()
3397 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3398 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3399 return
3400
3401 self.report_extraction(video_id)
3402
3403
3404 # Extract video URL
3405 mobj = re.search(r'flv_url=(.+?)&', webpage)
3406 if mobj is None:
3407 self._downloader.trouble(u'ERROR: unable to extract video url')
3408 return
3409 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3410
3411
3412 # Extract title
3413 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3414 if mobj is None:
3415 self._downloader.trouble(u'ERROR: unable to extract video title')
3416 return
3417 video_title = mobj.group(1).decode('utf-8')
3418
3419
3420 # Extract video thumbnail
3421 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3422 if mobj is None:
3423 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3424 return
3425 video_thumbnail = mobj.group(1).decode('utf-8')
3426
3427
3428
3429 self._downloader.increment_downloads()
3430 info = {
3431 'id': video_id,
3432 'url': video_url,
3433 'uploader': None,
3434 'upload_date': None,
3435 'title': video_title,
3436 'stitle': _simplify_title(video_title),
3437 'ext': 'flv',
3438 'format': 'flv',
3439 'thumbnail': video_thumbnail,
3440 'description': None,
3441 'player_url': None,
3442 }
3443
3444 try:
3445 self._downloader.process_info(info)
3446 except UnavailableVideoError, err:
3447 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3448
3449
3450 class SoundcloudIE(InfoExtractor):
3451 """Information extractor for soundcloud.com
3452 To access the media, the uid of the song and a stream token
3453 must be extracted from the page source and the script must make
3454 a request to media.soundcloud.com/crossdomain.xml. Then
3455 the media can be grabbed by requesting from an url composed
3456 of the stream token and uid
3457 """
3458
3459 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3460 IE_NAME = u'soundcloud'
3461
3462 def __init__(self, downloader=None):
3463 InfoExtractor.__init__(self, downloader)
3464
3465 def report_webpage(self, video_id):
3466 """Report information extraction."""
3467 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3468
3469 def report_extraction(self, video_id):
3470 """Report information extraction."""
3471 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3472
3473 def _real_extract(self, url):
3474 htmlParser = HTMLParser.HTMLParser()
3475
3476 mobj = re.match(self._VALID_URL, url)
3477 if mobj is None:
3478 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3479 return
3480
3481 # extract uploader (which is in the url)
3482 uploader = mobj.group(1).decode('utf-8')
3483 # extract simple title (uploader + slug of song title)
3484 slug_title = mobj.group(2).decode('utf-8')
3485 simple_title = uploader + '-' + slug_title
3486
3487 self.report_webpage('%s/%s' % (uploader, slug_title))
3488
3489 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3490 try:
3491 webpage = urllib2.urlopen(request).read()
3492 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3493 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3494 return
3495
3496 self.report_extraction('%s/%s' % (uploader, slug_title))
3497
3498 # extract uid and stream token that soundcloud hands out for access
3499 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3500 if mobj is None:
3501 self._downloader.trouble(u'ERROR: unable to extract uid and stream token')
3502 return
3503 video_id, stream_token = mobj.group(1), mobj.group(2)
3504 # extract unsimplified title
3505 mobj = re.search('"title":"(.*?)",', webpage)
3506 if mobj:
3507 title = mobj.group(1)
3508
3509 # construct media url (with uid/token)
3510 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3511 mediaURL = mediaURL % (video_id, stream_token)
3512
3513 # description
3514 description = u'No description available'
3515 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3516 if mobj:
3517 description = mobj.group(1)
3518
3519 # upload date
3520 upload_date = None
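# the page shows e.g. "on November 22, 2011 12:34"; strptime below turns
# that into the YYYYMMDD form (20111122) used for upload_date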
3521 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3522 if mobj:
3523 try:
3524 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3525 except ValueError, e:
3526 self._downloader.to_stderr(u'WARNING: unable to parse upload date: %s' % str(e))
3527
3528 # for soundcloud, a request to a cross domain is required for cookies
3529 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', None, std_headers)
3530
3531 try:
3532 self._downloader.process_info({
3533 'id': video_id.decode('utf-8'),
3534 'url': mediaURL,
3535 'uploader': uploader.decode('utf-8'),
3536 'upload_date': upload_date,
3537 'title': simple_title.decode('utf-8'),
3538 'stitle': simple_title.decode('utf-8'),
3539 'ext': u'mp3',
3540 'format': u'NA',
3541 'player_url': None,
3542 'description': description.decode('utf-8')
3543 })
3544 except UnavailableVideoError:
3545 self._downloader.trouble(u'\nERROR: unable to download video')
3546
3547
3548 class InfoQIE(InfoExtractor):
3549 """Information extractor for infoq.com"""
3550
3551 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3552 IE_NAME = u'infoq'
3553
3554 def report_webpage(self, video_id):
3555 """Report information extraction."""
3556 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3557
3558 def report_extraction(self, video_id):
3559 """Report information extraction."""
3560 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3561
3562 def _real_extract(self, url):
3563 htmlParser = HTMLParser.HTMLParser()
3564
3565 mobj = re.match(self._VALID_URL, url)
3566 if mobj is None:
3567 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3568 return
3569
3570 self.report_webpage(url)
3571
3572 request = urllib2.Request(url)
3573 try:
3574 webpage = urllib2.urlopen(request).read()
3575 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3576 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3577 return
3578
3579 self.report_extraction(url)
3580
3581
3582 # Extract video URL
3583 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3584 if mobj is None:
3585 self._downloader.trouble(u'ERROR: unable to extract video url')
3586 return
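# jsclassref holds a base64-encoded, URL-quoted path; decode it and
# append it to the RTMPE base to obtain the actual stream URL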
3587 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3588
3589
3590 # Extract title
3591 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3592 if mobj is None:
3593 self._downloader.trouble(u'ERROR: unable to extract video title')
3594 return
3595 video_title = mobj.group(1).decode('utf-8')
3596
3597 # Extract description
3598 video_description = u'No description available.'
3599 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3600 if mobj is not None:
3601 video_description = mobj.group(1).decode('utf-8')
3602
3603 video_filename = video_url.split('/')[-1]
3604 video_id, extension = video_filename.rsplit('.', 1)
3605
3606 self._downloader.increment_downloads()
3607 info = {
3608 'id': video_id,
3609 'url': video_url,
3610 'uploader': None,
3611 'upload_date': None,
3612 'title': video_title,
3613 'stitle': _simplify_title(video_title),
3614 'ext': extension,
3615 'format': extension, # the filename extension is always(?) mp4, but the actual stream appears to be flv
3616 'thumbnail': None,
3617 'description': video_description,
3618 'player_url': None,
3619 }
3620
3621 try:
3622 self._downloader.process_info(info)
3623 except UnavailableVideoError, err:
3624 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3625
3626 class MixcloudIE(InfoExtractor):
3627 """Information extractor for www.mixcloud.com"""
3628 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3629 IE_NAME = u'mixcloud'
3630
3631 def __init__(self, downloader=None):
3632 InfoExtractor.__init__(self, downloader)
3633
3634 def report_download_json(self, file_id):
3635 """Report JSON download."""
3636 self._downloader.to_screen(u'[%s] %s: Downloading json' % (self.IE_NAME, file_id))
3637
3638 def report_extraction(self, file_id):
3639 """Report information extraction."""
3640 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3641
3642 def get_urls(self, jsonData, fmt, bitrate='best'):
3643 """Get urls from 'audio_formats' section in json"""
3644 file_url = None
3645 try:
3646 bitrate_list = jsonData[fmt]
3647 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3648 bitrate = max(bitrate_list) # select highest
3649
3650 url_list = jsonData[fmt][bitrate]
3651 except TypeError: # we have no bitrate info.
3652 url_list = jsonData[fmt]
3653
3654 return url_list
3655
3656 def check_urls(self, url_list):
3657 """Return the first URL in url_list that responds, or None if none do."""
3658 for url in url_list:
3659 try:
3660 urllib2.urlopen(url)
3661 return url
3662 except (urllib2.URLError, httplib.HTTPException, socket.error):
3663 continue
3664 
3665 return None
3666
3667 def _print_formats(self, formats):
3668 print 'Available formats:'
3669 for fmt in formats.keys():
3670 for b in formats[fmt]:
3671 try:
3672 ext = formats[fmt][b][0]
3673 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3674 except TypeError: # we have no bitrate info
3675 ext = formats[fmt][0]
3676 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3677 break
3678
3679 def _real_extract(self, url):
3680 mobj = re.match(self._VALID_URL, url)
3681 if mobj is None:
3682 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3683 return
3684 # extract uploader & filename from url
3685 uploader = mobj.group(1).decode('utf-8')
3686 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3687
3688 # construct API request
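# e.g. http://www.mixcloud.com/<user>/<cloudcast>/ becomes
# http://www.mixcloud.com/api/1/cloudcast/<user>/<cloudcast>.json
# (the join below relies on the usual trailing slash in the page URL)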
3689 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3690 # retrieve .json file with links to files
3691 request = urllib2.Request(file_url)
3692 try:
3693 self.report_download_json(file_url)
3694 jsonData = urllib2.urlopen(request).read()
3695 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3696 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3697 return
3698
3699 # parse JSON
3700 json_data = json.loads(jsonData)
3701 player_url = json_data['player_swf_url']
3702 formats = dict(json_data['audio_formats'])
3703
3704 req_format = self._downloader.params.get('format', None)
3705 bitrate = None
3706
3707 if self._downloader.params.get('listformats', None):
3708 self._print_formats(formats)
3709 return
3710
3711 if req_format is None or req_format == 'best':
3712 for format_param in formats.keys():
3713 url_list = self.get_urls(formats, format_param)
3714 # check urls
3715 file_url = self.check_urls(url_list)
3716 if file_url is not None:
3717 break # got it!
3718 else:
3719 if req_format not in formats.keys():
3720 self._downloader.trouble(u'ERROR: format is not available')
3721 return
3722
3723 url_list = self.get_urls(formats, req_format)
3724 file_url = self.check_urls(url_list)
3725 format_param = req_format
3726
3727 # We have audio
3728 self._downloader.increment_downloads()
3729 try:
3730 # Process file information
3731 self._downloader.process_info({
3732 'id': file_id.decode('utf-8'),
3733 'url': file_url.decode('utf-8'),
3734 'uploader': uploader.decode('utf-8'),
3735 'upload_date': u'NA',
3736 'title': json_data['name'],
3737 'stitle': _simplify_title(json_data['name']),
3738 'ext': file_url.split('.')[-1].decode('utf-8'),
3739 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3740 'thumbnail': json_data['thumbnail_url'],
3741 'description': json_data['description'],
3742 'player_url': player_url.decode('utf-8'),
3743 })
3744 except UnavailableVideoError, err:
3745 self._downloader.trouble(u'ERROR: unable to download file')
3746
3747
3748
3749 class PostProcessor(object):
3750 """Post Processor class.
3751
3752 PostProcessor objects can be added to downloaders with their
3753 add_post_processor() method. When the downloader has finished a
3754 successful download, it will take its internal chain of PostProcessors
3755 and start calling the run() method on each one of them, first with
3756 an initial argument and then with the returned value of the previous
3757 PostProcessor.
3758
3759 The chain will be stopped if one of them ever returns None or the end
3760 of the chain is reached.
3761
3762 PostProcessor objects follow a "mutual registration" process similar
3763 to InfoExtractor objects.
3764 """
3765
3766 _downloader = None
3767
3768 def __init__(self, downloader=None):
3769 self._downloader = downloader
3770
3771 def set_downloader(self, downloader):
3772 """Sets the downloader for this PP."""
3773 self._downloader = downloader
3774
3775 def run(self, information):
3776 """Run the PostProcessor.
3777
3778 The "information" argument is a dictionary like the ones
3779 composed by InfoExtractors. The only difference is that this
3780 one has an extra field called "filepath" that points to the
3781 downloaded file.
3782
3783 When this method returns None, the postprocessing chain is
3784 stopped. However, this method may return an information
3785 dictionary that will be passed to the next postprocessing
3786 object in the chain. It can be the one it received after
3787 changing some fields.
3788
3789 In addition, this method may raise a PostProcessingError
3790 exception that will be taken into account by the downloader
3791 it was called from.
3792 """
3793 return information # by default, do nothing
3794
3795
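# A minimal sketch of the chaining protocol described in the PostProcessor
# docstring: a hypothetical no-op postprocessor that logs the downloaded
# file and hands the info dict on unchanged (returning None instead would
# stop the chain). The class name is illustrative and not used anywhere.
#
# class EchoFilenamePP(PostProcessor):
# 	def run(self, information):
# 		self._downloader.to_screen(u'[echo] %s' % information['filepath'])
# 		return information
#
# fd.add_post_processor(EchoFilenamePP())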
3796 class FFmpegExtractAudioPP(PostProcessor):
3797
3798 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
3799 PostProcessor.__init__(self, downloader)
3800 if preferredcodec is None:
3801 preferredcodec = 'best'
3802 self._preferredcodec = preferredcodec
3803 self._preferredquality = preferredquality
3804 self._keepvideo = keepvideo
3805
3806 @staticmethod
3807 def get_audio_codec(path):
3808 try:
3809 cmd = ['ffprobe', '-show_streams', '--', path]
3810 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3811 output = handle.communicate()[0]
3812 if handle.wait() != 0:
3813 return None
3814 except (IOError, OSError):
3815 return None
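# scan ffprobe's key=value stream dump: remember the most recent
# codec_name and return it once a codec_type=audio line confirms the
# stream is an audio stream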
3816 audio_codec = None
3817 for line in output.split('\n'):
3818 if line.startswith('codec_name='):
3819 audio_codec = line.split('=')[1].strip()
3820 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
3821 return audio_codec
3822 return None
3823
3824 @staticmethod
3825 def run_ffmpeg(path, out_path, codec, more_opts):
3826 try:
3827 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3828 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3829 return (ret == 0)
3830 except (IOError, OSError):
3831 return False
3832
3833 def run(self, information):
3834 path = information['filepath']
3835
3836 filecodec = self.get_audio_codec(path)
3837 if filecodec is None:
3838 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
3839 return None
3840
3841 more_opts = []
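# Prefer remuxing over transcoding: copy the stream as-is when the source
# codec is the requested one (or 'best' was requested) and can be wrapped
# in a suitable container; fall back to re-encoding otherwise.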
3842 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3843 if filecodec in ['aac', 'mp3', 'vorbis']:
3844 # Lossless if possible
3845 acodec = 'copy'
3846 extension = filecodec
3847 if filecodec == 'aac':
3848 more_opts = ['-f', 'adts']
3849 if filecodec == 'vorbis':
3850 extension = 'ogg'
3851 else:
3852 # MP3 otherwise.
3853 acodec = 'libmp3lame'
3854 extension = 'mp3'
3855 more_opts = []
3856 if self._preferredquality is not None:
3857 more_opts += ['-ab', self._preferredquality]
3858 else:
3859 # We convert the audio (lossy)
3860 acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
3861 extension = self._preferredcodec
3862 more_opts = []
3863 if self._preferredquality is not None:
3864 more_opts += ['-ab', self._preferredquality]
3865 if self._preferredcodec == 'aac':
3866 more_opts += ['-f', 'adts']
3867 if self._preferredcodec == 'vorbis':
3868 extension = 'ogg'
3869
3870 (prefix, ext) = os.path.splitext(path)
3871 new_path = prefix + '.' + extension
3872 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3873 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3874
3875 if not status:
3876 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
3877 return None
3878
3879 # Try to update the date time for extracted audio file.
3880 if information.get('filetime') is not None:
3881 try:
3882 os.utime(new_path, (time.time(), information['filetime']))
3883 except:
3884 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
3885
3886 if not self._keepvideo:
3887 try:
3888 os.remove(path)
3889 except (IOError, OSError):
3890 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
3891 return None
3892
3893 information['filepath'] = new_path
3894 return information
3895
3896
3897 def updateSelf(downloader, filename):
3898 ''' Update the program file with the latest version from the repository '''
3899 # Note: downloader only used for options
3900 if not os.access(filename, os.W_OK):
3901 sys.exit('ERROR: no write permissions on %s' % filename)
3902
3903 downloader.to_screen('Updating to latest version...')
3904
3905 try:
3906 urlh = urllib.urlopen(UPDATE_URL)
3907 try:
3908 newcontent = urlh.read()
3909
3910 vmatch = re.search("__version__ = '([^']+)'", newcontent)
3911 if vmatch is not None and vmatch.group(1) == __version__:
3912 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
3913 return
3914 finally:
3915 urlh.close()
3916 except (IOError, OSError), err:
3917 sys.exit('ERROR: unable to download latest version')
3918
3919 try:
3920 outf = open(filename, 'wb')
3921 try:
3922 outf.write(newcontent)
3923 finally:
3924 outf.close()
3925 except (IOError, OSError), err:
3926 sys.exit('ERROR: unable to overwrite current version')
3927
3928 downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
3929
3930 def parseOpts():
3931 # Deferred imports
3932 import getpass
3933 import optparse
3934 import shlex
3935
3936 def _readOptions(filename):
3937 try:
3938 optionf = open(filename)
3939 except IOError:
3940 return [] # silently skip if file is not present
3941 try:
3942 res = []
3943 for l in optionf:
3944 res += shlex.split(l, comments=True)
3945 finally:
3946 optionf.close()
3947 return res
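# A hypothetical configuration file understood by _readOptions; shlex
# splitting means shell-style quoting and '#' comments both work:
#
#   # always resume, keep server file times
#   --continue
#   -o "%(stitle)s-%(id)s.%(ext)s"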
3948
3949 def _format_option_string(option):
3950 ''' ('-o', '--option') -> -o, --option METAVAR'''
3951
3952 opts = []
3953
3954 if option._short_opts: opts.append(option._short_opts[0])
3955 if option._long_opts: opts.append(option._long_opts[0])
3956 if len(opts) > 1: opts.insert(1, ', ')
3957
3958 if option.takes_value(): opts.append(' %s' % option.metavar)
3959
3960 return "".join(opts)
3961
3962 def _find_term_columns():
3963 columns = os.environ.get('COLUMNS', None)
3964 if columns:
3965 return int(columns)
3966
3967 try:
3968 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3969 out,err = sp.communicate()
3970 return int(out.split()[1])
3971 except:
3972 pass
3973 return None
3974
3975 max_width = 80
3976 max_help_position = 80
3977
3978 # No need to wrap help messages if we're on a wide console
3979 columns = _find_term_columns()
3980 if columns: max_width = columns
3981
3982 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3983 fmt.format_option_strings = _format_option_string
3984
3985 kw = {
3986 'version' : __version__,
3987 'formatter' : fmt,
3988 'usage' : '%prog [options] url [url...]',
3989 'conflict_handler' : 'resolve',
3990 }
3991
3992 parser = optparse.OptionParser(**kw)
3993
3994 # option groups
3995 general = optparse.OptionGroup(parser, 'General Options')
3996 selection = optparse.OptionGroup(parser, 'Video Selection')
3997 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3998 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3999 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4000 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4001 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4002
4003 general.add_option('-h', '--help',
4004 action='help', help='print this help text and exit')
4005 general.add_option('-v', '--version',
4006 action='version', help='print program version and exit')
4007 general.add_option('-U', '--update',
4008 action='store_true', dest='update_self', help='update this program to latest version')
4009 general.add_option('-i', '--ignore-errors',
4010 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4011 general.add_option('-r', '--rate-limit',
4012 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4013 general.add_option('-R', '--retries',
4014 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4015 general.add_option('--dump-user-agent',
4016 action='store_true', dest='dump_user_agent',
4017 help='display the current browser identification', default=False)
4018 general.add_option('--list-extractors',
4019 action='store_true', dest='list_extractors',
4020 help='List all supported extractors and the URLs they would handle', default=False)
4021
4022 selection.add_option('--playlist-start',
4023 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4024 selection.add_option('--playlist-end',
4025 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4026 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4027 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4028 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4029
4030 authentication.add_option('-u', '--username',
4031 dest='username', metavar='USERNAME', help='account username')
4032 authentication.add_option('-p', '--password',
4033 dest='password', metavar='PASSWORD', help='account password')
4034 authentication.add_option('-n', '--netrc',
4035 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4036
4037
4038 video_format.add_option('-f', '--format',
4039 action='store', dest='format', metavar='FORMAT', help='video format code')
4040 video_format.add_option('--all-formats',
4041 action='store_const', dest='format', help='download all available video formats', const='all')
4042 video_format.add_option('--max-quality',
4043 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4044 video_format.add_option('-F', '--list-formats',
4045 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4046
4047
4048 verbosity.add_option('-q', '--quiet',
4049 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4050 verbosity.add_option('-s', '--simulate',
4051 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4052 verbosity.add_option('--skip-download',
4053 action='store_true', dest='skip_download', help='do not download the video', default=False)
4054 verbosity.add_option('-g', '--get-url',
4055 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4056 verbosity.add_option('-e', '--get-title',
4057 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4058 verbosity.add_option('--get-thumbnail',
4059 action='store_true', dest='getthumbnail',
4060 help='simulate, quiet but print thumbnail URL', default=False)
4061 verbosity.add_option('--get-description',
4062 action='store_true', dest='getdescription',
4063 help='simulate, quiet but print video description', default=False)
4064 verbosity.add_option('--get-filename',
4065 action='store_true', dest='getfilename',
4066 help='simulate, quiet but print output filename', default=False)
4067 verbosity.add_option('--get-format',
4068 action='store_true', dest='getformat',
4069 help='simulate, quiet but print output format', default=False)
4070 verbosity.add_option('--no-progress',
4071 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4072 verbosity.add_option('--console-title',
4073 action='store_true', dest='consoletitle',
4074 help='display progress in console titlebar', default=False)
4075
4076
4077 filesystem.add_option('-t', '--title',
4078 action='store_true', dest='usetitle', help='use title in file name', default=False)
4079 filesystem.add_option('-l', '--literal',
4080 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4081 filesystem.add_option('-A', '--auto-number',
4082 action='store_true', dest='autonumber',
4083 help='number downloaded files starting from 00000', default=False)
4084 filesystem.add_option('-o', '--output',
4085 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4086 filesystem.add_option('-a', '--batch-file',
4087 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4088 filesystem.add_option('-w', '--no-overwrites',
4089 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4090 filesystem.add_option('-c', '--continue',
4091 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
4092 filesystem.add_option('--no-continue',
4093 action='store_false', dest='continue_dl',
4094 help='do not resume partially downloaded files (restart from beginning)')
4095 filesystem.add_option('--cookies',
4096 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4097 filesystem.add_option('--no-part',
4098 action='store_true', dest='nopart', help='do not use .part files', default=False)
4099 filesystem.add_option('--no-mtime',
4100 action='store_false', dest='updatetime',
4101 help='do not use the Last-modified header to set the file modification time', default=True)
4102 filesystem.add_option('--write-description',
4103 action='store_true', dest='writedescription',
4104 help='write video description to a .description file', default=False)
4105 filesystem.add_option('--write-info-json',
4106 action='store_true', dest='writeinfojson',
4107 help='write video metadata to a .info.json file', default=False)
4108
4109
4110 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4111 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4112 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4113 help='"best", "aac", "vorbis" or "mp3"; best by default')
4114 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4115 help='ffmpeg audio bitrate specification, 128k by default')
4116 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4117 help='keeps the video file on disk after the post-processing; the video is erased by default')
4118
4119
4120 parser.add_option_group(general)
4121 parser.add_option_group(selection)
4122 parser.add_option_group(filesystem)
4123 parser.add_option_group(verbosity)
4124 parser.add_option_group(video_format)
4125 parser.add_option_group(authentication)
4126 parser.add_option_group(postproc)
4127
4128 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4129 if xdg_config_home:
4130 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4131 else:
4132 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
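# Options are parsed left to right, so the command line overrides the
# user configuration, which in turn overrides /etc/youtube-dl.conf.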
4133 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4134 opts, args = parser.parse_args(argv)
4135
4136 return parser, opts, args
4137
4138 def gen_extractors():
4139 """ Return a list of an instance of every supported extractor.
4140 The order does matter; the first extractor matched is the one handling the URL.
4141 """
4142 youtube_ie = YoutubeIE()
4143 google_ie = GoogleIE()
4144 yahoo_ie = YahooIE()
4145 return [
4146 YoutubePlaylistIE(youtube_ie),
4147 YoutubeUserIE(youtube_ie),
4148 YoutubeSearchIE(youtube_ie),
4149 youtube_ie,
4150 MetacafeIE(youtube_ie),
4151 DailymotionIE(),
4152 google_ie,
4153 GoogleSearchIE(google_ie),
4154 PhotobucketIE(),
4155 yahoo_ie,
4156 YahooSearchIE(yahoo_ie),
4157 DepositFilesIE(),
4158 FacebookIE(),
4159 BlipTVIE(),
4160 VimeoIE(),
4161 MyVideoIE(),
4162 ComedyCentralIE(),
4163 EscapistIE(),
4164 CollegeHumorIE(),
4165 XVideosIE(),
4166 SoundcloudIE(),
4167 InfoQIE(),
4168 MixcloudIE(),
4169
4170 GenericIE()
4171 ]
4172
4173 def _real_main():
4174 parser, opts, args = parseOpts()
4175
4176 # Open appropriate CookieJar
4177 if opts.cookiefile is None:
4178 jar = cookielib.CookieJar()
4179 else:
4180 try:
4181 jar = cookielib.MozillaCookieJar(opts.cookiefile)
4182 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4183 jar.load()
4184 except (IOError, OSError), err:
4185 sys.exit(u'ERROR: unable to open cookie file')
4186
4187 # Dump user agent
4188 if opts.dump_user_agent:
4189 print std_headers['User-Agent']
4190 sys.exit(0)
4191
4192 # Batch file verification
4193 batchurls = []
4194 if opts.batchfile is not None:
4195 try:
4196 if opts.batchfile == '-':
4197 batchfd = sys.stdin
4198 else:
4199 batchfd = open(opts.batchfile, 'r')
4200 batchurls = batchfd.readlines()
4201 batchurls = [x.strip() for x in batchurls]
4202 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4203 except IOError:
4204 sys.exit(u'ERROR: batch file could not be read')
4205 all_urls = batchurls + args
4206
4207 # General configuration
4208 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4209 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4210 urllib2.install_opener(opener)
4211 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4212
4213 extractors = gen_extractors()
4214
4215 if opts.list_extractors:
4216 for ie in extractors:
4217 print(ie.IE_NAME)
4218 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4219 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4220 for mu in matchedUrls:
4221 print(u' ' + mu)
4222 sys.exit(0)
4223
4224 # Conflicting, missing and erroneous options
4225 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4226 parser.error(u'using .netrc conflicts with giving username/password')
4227 if opts.password is not None and opts.username is None:
4228 parser.error(u'account username missing')
4229 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4230 parser.error(u'using output template conflicts with using title, literal title or auto number')
4231 if opts.usetitle and opts.useliteral:
4232 parser.error(u'using title conflicts with using literal title')
4233 if opts.username is not None and opts.password is None:
4234 opts.password = getpass.getpass(u'Type account password and press return:')
4235 if opts.ratelimit is not None:
4236 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4237 if numeric_limit is None:
4238 parser.error(u'invalid rate limit specified')
4239 opts.ratelimit = numeric_limit
4240 if opts.retries is not None:
4241 try:
4242 opts.retries = long(opts.retries)
4243 except (TypeError, ValueError), err:
4244 parser.error(u'invalid retry count specified')
4245 try:
4246 opts.playliststart = int(opts.playliststart)
4247 if opts.playliststart <= 0:
4248 raise ValueError(u'Playlist start must be positive')
4249 except (TypeError, ValueError), err:
4250 parser.error(u'invalid playlist start number specified')
4251 try:
4252 opts.playlistend = int(opts.playlistend)
4253 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4254 raise ValueError(u'Playlist end must be greater than playlist start')
4255 except (TypeError, ValueError), err:
4256 parser.error(u'invalid playlist end number specified')
4257 if opts.extractaudio:
4258 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
4259 parser.error(u'invalid audio format specified')
4260
4261 # File downloader
4262 fd = FileDownloader({
4263 'usenetrc': opts.usenetrc,
4264 'username': opts.username,
4265 'password': opts.password,
4266 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4267 'forceurl': opts.geturl,
4268 'forcetitle': opts.gettitle,
4269 'forcethumbnail': opts.getthumbnail,
4270 'forcedescription': opts.getdescription,
4271 'forcefilename': opts.getfilename,
4272 'forceformat': opts.getformat,
4273 'simulate': opts.simulate,
4274 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4275 'format': opts.format,
4276 'format_limit': opts.format_limit,
4277 'listformats': opts.listformats,
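# the output template is picked by an and/or cascade: the first clause
# whose options are all set wins, falling back to plain %(id)s.%(ext)s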
4278 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4279 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4280 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4281 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4282 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4283 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4284 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4285 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4286 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4287 or u'%(id)s.%(ext)s'),
4288 'ignoreerrors': opts.ignoreerrors,
4289 'ratelimit': opts.ratelimit,
4290 'nooverwrites': opts.nooverwrites,
4291 'retries': opts.retries,
4292 'continuedl': opts.continue_dl,
4293 'noprogress': opts.noprogress,
4294 'playliststart': opts.playliststart,
4295 'playlistend': opts.playlistend,
4296 'logtostderr': opts.outtmpl == '-',
4297 'consoletitle': opts.consoletitle,
4298 'nopart': opts.nopart,
4299 'updatetime': opts.updatetime,
4300 'writedescription': opts.writedescription,
4301 'writeinfojson': opts.writeinfojson,
4302 'matchtitle': opts.matchtitle,
4303 'rejecttitle': opts.rejecttitle,
4304 'max_downloads': opts.max_downloads,
4305 })
4306 for extractor in extractors:
4307 fd.add_info_extractor(extractor)
4308
4309 # PostProcessors
4310 if opts.extractaudio:
4311 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4312
4313 # Update version
4314 if opts.update_self:
4315 updateSelf(fd, sys.argv[0])
4316
4317 # Maybe do nothing
4318 if len(all_urls) < 1:
4319 if not opts.update_self:
4320 parser.error(u'you must provide at least one URL')
4321 else:
4322 sys.exit()
4323 retcode = fd.download(all_urls)
4324
4325 # Dump cookie jar if requested
4326 if opts.cookiefile is not None:
4327 try:
4328 jar.save()
4329 except (IOError, OSError), err:
4330 sys.exit(u'ERROR: unable to save cookie jar')
4331
4332 sys.exit(retcode)
4333
4334 def main():
4335 try:
4336 _real_main()
4337 except DownloadError:
4338 sys.exit(1)
4339 except SameFileError:
4340 sys.exit(u'ERROR: fixed output name but more than one file to download')
4341 except KeyboardInterrupt:
4342 sys.exit(u'\nERROR: Interrupted by user')
4343
4344 if __name__ == '__main__':
4345 main()
4346
4347 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: