# Source provenance: file "youtube-dl" from the rg3/youtube-dl git repository
# (mirrored at jfr.im), commit "Bump version number".
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
# Contributors to youtube-dl, in rough order of first contribution.
__author__ = (
    'Ricardo Garcia Gonzalez',
    'Danny Colligan',
    'Benjamin Johnson',
    'Vasyl\' Vavrychuk',
    'Witold Baryluk',
    'Paweł Paprota',
    'Gergely Imreh',
    'Rogério Brito',
    'Philipp Hagemeister',
    'Sören Schulze',
    )

__license__ = 'Public Domain'
# Date-based version scheme: YYYY.MM.DD of the release.
__version__ = '2011.09.15'

# Location of the latest released script, used by the self-update mechanism.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
21
22 import cookielib
23 import datetime
24 import gzip
25 import htmlentitydefs
26 import HTMLParser
27 import httplib
28 import locale
29 import math
30 import netrc
31 import os
32 import os.path
33 import re
34 import socket
35 import string
36 import subprocess
37 import sys
38 import time
39 import urllib
40 import urllib2
41 import warnings
42 import zlib
43
44 if os.name == 'nt':
45 import ctypes
46
47 try:
48 import email.utils
49 except ImportError: # Python 2.4
50 import email.Utils
51 try:
52 import cStringIO as StringIO
53 except ImportError:
54 import StringIO
55
56 # parse_qs was moved from the cgi module to the urlparse module recently.
57 try:
58 from urlparse import parse_qs
59 except ImportError:
60 from cgi import parse_qs
61
62 try:
63 import lxml.etree
64 except ImportError:
65 pass # Handled below
66
67 try:
68 import xml.etree.ElementTree
69 except ImportError: # Python<2.5
70 pass # Not officially supported, but let it slip
71
# Default HTTP headers attached to every request; they mimic a desktop
# Firefox browser so servers return the same pages a real user would see.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}

# Characters allowed in "simple" titles: ASCII letters and digits, decoded
# to unicode so concatenation with unicode titles works under Python 2.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
81
try:
    import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    import re
    class json(object):
        # Minimal stand-in for the stdlib json module: provides loads() only
        # (note: no dump/dumps — callers must probe for that; see process_info).
        @staticmethod
        def loads(s):
            """Parse a JSON document from a UTF-8 byte string *s*.

            Implemented as a hand-rolled recursive-descent parser; raises
            ValueError (with position information) on malformed input.
            """
            s = s.decode('UTF-8')
            def raiseError(msg, i):
                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
            def skipSpace(i, expectMore=True):
                # Advance i past JSON whitespace; optionally require that
                # more input follows (inside a structure it always must).
                while i < len(s) and s[i] in ' \t\r\n':
                    i += 1
                if expectMore:
                    if i >= len(s):
                        raiseError('Premature end', i)
                return i
            def decodeEscape(match):
                # Decode a single backslash escape captured by parseString's
                # regular expression.
                esc = match.group(1)
                _STATIC = {
                    '"': '"',
                    '\\': '\\',
                    '/': '/',
                    'b': unichr(0x8),
                    'f': unichr(0xc),
                    'n': '\n',
                    'r': '\r',
                    't': '\t',
                }
                if esc in _STATIC:
                    return _STATIC[esc]
                if esc[0] == 'u':
                    if len(esc) == 1+4:
                        # Plain \uXXXX escape.
                        return unichr(int(esc[1:5], 16))
                    if len(esc) == 5+6 and esc[5:7] == '\\u':
                        # Surrogate pair \uD8xx\uDCxx: combine into one
                        # astral-plane code point.
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                raise ValueError('Unknown escape ' + str(esc))
            def parseString(i):
                i += 1
                e = i
                # Find the closing quote, skipping over quotes preceded by
                # an odd number of backslashes (i.e. escaped quotes).
                while True:
                    e = s.index('"', e)
                    bslashes = 0
                    while s[e-bslashes-1] == '\\':
                        bslashes += 1
                    if bslashes % 2 == 1:
                        e += 1
                        continue
                    break
                # Surrogate pairs must be matched before lone \u escapes.
                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                stri = rexp.sub(decodeEscape, s[i:e])
                return (e+1,stri)
            def parseObj(i):
                i += 1
                res = {}
                i = skipSpace(i)
                if s[i] == '}': # Empty dictionary
                    return (i+1,res)
                while True:
                    if s[i] != '"':
                        raiseError('Expected a string object key', i)
                    i,key = parseString(i)
                    i = skipSpace(i)
                    if i >= len(s) or s[i] != ':':
                        raiseError('Expected a colon', i)
                    i,val = parse(i+1)
                    res[key] = val
                    i = skipSpace(i)
                    if s[i] == '}':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected comma or closing curly brace', i)
                    i = skipSpace(i+1)
            def parseArray(i):
                res = []
                i = skipSpace(i+1)
                if s[i] == ']': # Empty array
                    return (i+1,res)
                while True:
                    i,val = parse(i)
                    res.append(val)
                    i = skipSpace(i) # Raise exception if premature end
                    if s[i] == ']':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected a comma or closing bracket', i)
                    i = skipSpace(i+1)
            def parseDiscrete(i):
                # The three bare JSON literals.
                for k,v in {'true': True, 'false': False, 'null': None}.items():
                    if s.startswith(k, i):
                        return (i+len(k), v)
                raiseError('Not a boolean (or null)', i)
            def parseNumber(i):
                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                if mobj is None:
                    raiseError('Not a number', i)
                nums = mobj.group(1)
                if '.' in nums or 'e' in nums or 'E' in nums:
                    return (i+len(nums), float(nums))
                return (i+len(nums), int(nums))
            # Dispatch on the first character of a value; anything that is
            # not a known structural/literal start is tried as a number.
            CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
            def parse(i):
                i = skipSpace(i)
                i,res = CHARMAP.get(s[i], parseNumber)(i)
                i = skipSpace(i, False)
                return (i,res)
            i,res = parse(0)
            if i < len(s):
                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
            return res
194
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # The original routed this through a one-shot generator and called
    # .next() on it, which added nothing; a plain try/except is equivalent.
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported codec actually exists and can encode text;
        # some platforms report bogus or empty values.
        u'TEST'.encode(pref)
    except Exception:
        # Misconfigured locale: fall back to a safe default.
        pref = 'UTF-8'
    return pref
210
211
212 def htmlentity_transform(matchobj):
213 """Transforms an HTML entity to a Unicode character.
214
215 This function receives a match object and is intended to be used with
216 the re.sub() function.
217 """
218 entity = matchobj.group(1)
219
220 # Known non-numeric HTML entity
221 if entity in htmlentitydefs.name2codepoint:
222 return unichr(htmlentitydefs.name2codepoint[entity])
223
224 # Unicode character
225 mobj = re.match(ur'(?u)#(x?\d+)', entity)
226 if mobj is not None:
227 numstr = mobj.group(1)
228 if numstr.startswith(u'x'):
229 base = 16
230 numstr = u'0%s' % numstr
231 else:
232 base = 10
233 return unichr(long(numstr, base))
234
235 # Unknown entity in name, return its literal representation
236 return (u'&%s;' % entity)
237
238
239 def sanitize_title(utitle):
240 """Sanitizes a video title so it could be used as part of a filename."""
241 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242 return utitle.replace(unicode(os.sep), u'%')
243
244
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            # "-" means standard output. On Windows, stdout must be put
            # into binary mode, or CRLF translation would corrupt the
            # downloaded video data.
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(filename, open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
270
271
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # parsedate_tz returns None for unparseable input, in which case we
    # report "no timestamp available" the same way.
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
279
280
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
289
290
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
298
299
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
307
308
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
316
317
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Record the mismatch so callers can report both figures.
        self.expected = expected
        self.downloaded = downloaded
332
333
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Some servers send raw deflate data without the zlib header; try
        # the raw variant first and fall back to standard decompression.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # addinfourl only accepts a "code" constructor argument on newer
        # Python 2 versions (where getcode exists); otherwise set it after
        # construction.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force the standard headers, replacing any caller-supplied values.
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        # Honour the magic no-compression marker and strip it before the
        # request goes out on the wire.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
391
392
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:         Username for authentication purposes.
    password:         Password for authentication purposes.
    usenetrc:         Use netrc for authentication instead.
    quiet:            Do not print messages to stdout.
    forceurl:         Force printing final URL.
    forcetitle:       Force printing title.
    forcethumbnail:   Force printing thumbnail URL.
    forcedescription: Force printing description.
    forcefilename:    Force printing final filename.
    simulate:         Do not download the video files.
    format:           Video format code.
    format_limit:     Highest quality format to try.
    outtmpl:          Template for output names.
    ignoreerrors:     Do not stop on download errors.
    ratelimit:        Download speed limit, in bytes/sec.
    nooverwrites:     Prevent overwriting files.
    retries:          Number of times to retry for HTTP error 5xx
    continuedl:       Try to continue downloads if possible.
    noprogress:       Do not print the progress bar.
    playliststart:    Playlist item to start at.
    playlistend:      Playlist item to end at.
    matchtitle:       Download only matching titles.
    rejecttitle:      Reject downloads for matching titles.
    logtostderr:      Log messages to stderr instead of stdout.
    consoletitle:     Display progress in console window's titlebar.
    nopart:           Do not use temporary .part files.
    updatetime:       Use the Last-modified header to set output file timestamps.
    writedescription: Write the video description to a .description file
    writeinfojson:    Write the video description to a .info.json file
    """

    # Class-level placeholders; __init__ rebinds every one of these per
    # instance, so the shared class-level lists are never mutated.
    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None
    _screen_file = None
457
458 def __init__(self, params):
459 """Create a FileDownloader object with the given options."""
460 self._ies = []
461 self._pps = []
462 self._download_retcode = 0
463 self._num_downloads = 0
464 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
465 self.params = params
466
    @staticmethod
    def format_bytes(bytes):
        """Return a human-readable size string (e.g. "12.34M") for a byte count."""
        if bytes is None:
            return 'N/A'
        if type(bytes) is str:
            bytes = float(bytes)
        if bytes == 0.0:
            exponent = 0
        else:
            # Index into the suffix table such that 1024**exponent <= bytes.
            exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffix)
480
481 @staticmethod
482 def calc_percent(byte_counter, data_len):
483 if data_len is None:
484 return '---.-%'
485 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
486
    @staticmethod
    def calc_eta(start, now, total, current):
        """Estimate remaining time as "MM:SS", or "--:--" when unknown."""
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001: # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        # The progress line only has room for two digits of minutes.
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)
500
501 @staticmethod
502 def calc_speed(start, now, bytes):
503 dif = now - start
504 if bytes == 0 or dif < 0.001: # One millisecond
505 return '%10s' % '---b/s'
506 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
507
    @staticmethod
    def best_block_size(elapsed_time, bytes):
        """Choose the next read size, adapting to the measured transfer rate.

        The new block size is clamped between half and double the previous
        block size so it changes smoothly.
        """
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return long(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return long(new_max)
        if rate < new_min:
            return long(new_min)
        return long(rate)
520
    @staticmethod
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        if matchobj is None:
            return None
        number = float(matchobj.group(1))
        # An absent suffix yields '', whose index in the table is 0, i.e.
        # a multiplier of 1024**0 == 1.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))
530
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        # Mutual registration: the extractor gets a back-reference so it can
        # ask this downloader to process the information it extracts.
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        pp.set_downloader(self)
540
    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        try:
            if not self.params.get('quiet', False):
                terminator = [u'\n', u''][skip_eol]
                # Encode explicitly: the target stream may be a pipe with no
                # associated encoding. The trailing comma keeps print from
                # adding its own newline on top of the terminator.
                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                self._screen_file.flush()
        except (UnicodeEncodeError), err:
            if not ignore_encoding_errors:
                raise
551
    def to_stderr(self, message):
        """Print message to stderr."""
        # Encoded explicitly for the same reason as in to_screen().
        print >>sys.stderr, message.encode(preferredencoding())
555
    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            return
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm-compatible escape sequence for setting the window title.
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
566
    def fixed_template(self):
        """Checks if the output template is fixed."""
        # "Fixed" means there are no %(field)s substitutions, so every
        # download would end up in the same file.
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # Errors are being ignored: remember the failure in the exit code.
        self._download_retcode = 1
583
584 def slow_down(self, start_time, byte_counter):
585 """Sleep if the download speed is over the rate limit."""
586 rate_limit = self.params.get('ratelimit', None)
587 if rate_limit is None or byte_counter == 0:
588 return
589 now = time.time()
590 elapsed = now - start_time
591 if elapsed <= 0.0:
592 return
593 speed = float(byte_counter) / elapsed
594 if speed > rate_limit:
595 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
596
597 def temp_name(self, filename):
598 """Returns a temporary filename for the given filename."""
599 if self.params.get('nopart', False) or filename == u'-' or \
600 (os.path.exists(filename) and not os.path.isfile(filename)):
601 return filename
602 return filename + u'.part'
603
604 def undo_temp_name(self, filename):
605 if filename.endswith(u'.part'):
606 return filename[:-len(u'.part')]
607 return filename
608
    def try_rename(self, old_filename, new_filename):
        """Rename the temporary file to its final name, reporting failures."""
        try:
            if old_filename == new_filename:
                return
            os.rename(old_filename, new_filename)
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')
616
617 def try_utime(self, filename, last_modified_hdr):
618 """Try to set the last-modified time of the given file."""
619 if last_modified_hdr is None:
620 return
621 if not os.path.isfile(filename):
622 return
623 timestr = last_modified_hdr
624 if timestr is None:
625 return
626 filetime = timeconvert(timestr)
627 if filetime is None:
628 return
629 try:
630 os.utime(filename, (time.time(), filetime))
631 except:
632 pass
633
    def report_writedescription(self, descfn):
        """ Report that the description file is being written """
        self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)

    def report_writeinfojson(self, infofn):
        """ Report that the metadata file has been written """
        self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            return
        # The leading \r rewrites the current console line in place.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # The file name may not be representable in the console charset.
            self.to_screen(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        else:
            # The progress line is already on screen; just terminate it.
            self.to_screen(u'')

    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1
684
    def prepare_filename(self, info_dict):
        """Generate the output filename.

        Returns None (after reporting trouble) if the output template does
        not match the available fields.
        """
        try:
            template_dict = dict(info_dict)
            # Extra template fields computed by the downloader itself.
            template_dict['epoch'] = unicode(long(time.time()))
            template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
            filename = self.params['outtmpl'] % template_dict
            return filename
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            return None
696
    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor.

        Handles forced printings / simulation, title match filtering,
        auxiliary file writing (.description / .info.json), the actual
        download and finally postprocessing.
        """
        filename = self.prepare_filename(info_dict)
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Forced printings
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcefilename', False) and filename is not None:
                print filename.encode(preferredencoding(), 'xmlcharrefreplace')

            return

        if filename is None:
            return

        # Title-based accept/reject filtering.
        matchtitle=self.params.get('matchtitle',False)
        rejecttitle=self.params.get('rejecttitle',False)
        title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
        if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
            self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
            return
        if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
            self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
            return

        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')
            return

        # Create the containing directory if needed.
        try:
            dn = os.path.dirname(filename)
            if dn != '' and not os.path.exists(dn):
                os.makedirs(dn)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directory ' + unicode(err))
            return

        if self.params.get('writedescription', False):
            try:
                descfn = filename + '.description'
                self.report_writedescription(descfn)
                descfile = open(descfn, 'wb')
                try:
                    descfile.write(info_dict['description'].encode('utf-8'))
                finally:
                    descfile.close()
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write description file ' + descfn)
                return

        if self.params.get('writeinfojson', False):
            infofn = filename + '.info.json'
            self.report_writeinfojson(infofn)
            # Probe for a usable JSON *encoder*: the trivialjson fallback
            # class defined above only provides loads(), not dump().
            try:
                json.dump
            except (NameError,AttributeError):
                self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
                return
            try:
                infof = open(infofn, 'wb')
                try:
                    json.dump(info_dict, infof)
                finally:
                    infof.close()
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
                return

        try:
            success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:
            # Local I/O failure: the video itself is deemed unavailable.
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
            return
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
            return

        if success:
            try:
                self.post_process(filename, info_dict)
            except (PostProcessingError), err:
                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                return
789
790 def download(self, url_list):
791 """Download a given list of URLs."""
792 if len(url_list) > 1 and self.fixed_template():
793 raise SameFileError(self.params['outtmpl'])
794
795 for url in url_list:
796 suitable_found = False
797 for ie in self._ies:
798 # Go to next InfoExtractor if not suitable
799 if not ie.suitable(url):
800 continue
801
802 # Suitable InfoExtractor found
803 suitable_found = True
804
805 # Extract information from URL and process it
806 ie.extract(url)
807
808 # Suitable InfoExtractor had been found; go to next URL
809 break
810
811 if not suitable_found:
812 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
813
814 return self._download_retcode
815
    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        # Work on a copy so the extractor's dictionary is not mutated.
        info = dict(ie_info)
        info['filepath'] = filename
        for pp in self._pps:
            info = pp.run(info)
            if info is None:
                # A postprocessor may abort the chain by returning None.
                break
824
    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an rtmp:// stream by driving the external rtmpdump tool."""
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            # Resume (-e); after a plain failure (retval 1) also skip the
            # keyframe check (-k 1).
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            if prevsize == cursize and retval == 1:
                break
            # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
            if prevsize == cursize and retval == 2 and cursize > 1024:
                self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
                retval = 0
                break
        if retval == 0:
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
            self.try_rename(tmpfilename, filename)
            return True
        else:
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            return False
861
    def _do_download(self, filename, url, player_url):
        """Download *url* to *filename* over HTTP (or via rtmpdump).

        Supports resuming partial .part files, retrying on HTTP 5xx, rate
        limiting and adaptive block sizes. Returns True on success, False
        after a reported error; may raise ContentTooShortError.
        """
        # Check file already present
        if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
            self.report_file_already_downloaded(filename)
            return True

        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        tmpfilename = self.temp_name(filename)
        stream = None
        open_mode = 'wb'

        # Do not include the Accept-Encoding header: compressed responses
        # would break byte-range resuming. basic_request is kept without a
        # Range header as a fallback for error 416 handling below.
        headers = {'Youtubedl-no-compression': 'True'}
        basic_request = urllib2.Request(url, None, headers)
        request = urllib2.Request(url, None, headers)

        # Establish possible resume length
        if os.path.isfile(tmpfilename):
            resume_len = os.path.getsize(tmpfilename)
        else:
            resume_len = 0

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range', 'bytes=%d-' % resume_len)
            open_mode = 'ab'

        count = 0
        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
            try:
                data = urllib2.urlopen(request)
                break
            except (urllib2.HTTPError, ), err:
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                    raise
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    try:
                        # Open the connection again without the range header
                        data = urllib2.urlopen(basic_request)
                        content_length = data.info()['Content-Length']
                    except (urllib2.HTTPError, ), err:
                        if err.code < 500 or err.code >= 600:
                            raise
                    else:
                        # Examine the reported length
                        if (content_length is not None and
                                (resume_len - 100 < long(content_length) < resume_len + 100)):
                            # The file had already been fully downloaded.
                            # Explanation to the above condition: in issue #175 it was revealed that
                            # YouTube sometimes adds or removes a few bytes from the end of the file,
                            # changing the file size slightly and causing problems for some users. So
                            # I decided to implement a suggested change and consider the file
                            # completely downloaded if the file size differs less than 100 bytes from
                            # the one in the hard drive.
                            self.report_file_already_downloaded(filename)
                            self.try_rename(tmpfilename, filename)
                            return True
                        else:
                            # The length does not match, we start the download over
                            self.report_unable_to_resume()
                            open_mode = 'wb'
                            break
            # Retry
            count += 1
            if count <= retries:
                self.report_retry(count, retries)

        if count > retries:
            self.trouble(u'ERROR: giving up after %s retries' % retries)
            return False

        data_len = data.info().get('Content-length', None)
        if data_len is not None:
            # Content-length covers only the remaining bytes when resuming.
            data_len = long(data_len) + resume_len
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0 + resume_len
        block_size = 1024
        start = time.time()
        while True:
            # Download and write
            before = time.time()
            data_block = data.read(block_size)
            after = time.time()
            if len(data_block) == 0:
                break
            byte_counter += len(data_block)

            # Open file just in time
            if stream is None:
                try:
                    (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
                    assert stream is not None
                    filename = self.undo_temp_name(tmpfilename)
                    self.report_destination(filename)
                except (OSError, IOError), err:
                    self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
                    return False
            try:
                stream.write(data_block)
            except (IOError, OSError), err:
                self.trouble(u'\nERROR: unable to write data: %s' % str(err))
                return False
            # Adapt the block size to the observed throughput.
            block_size = self.best_block_size(after - before, len(data_block))

            # Progress message
            percent_str = self.calc_percent(byte_counter, data_len)
            eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
            speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
            self.report_progress(percent_str, data_len_str, speed_str, eta_str)

            # Apply rate limit
            self.slow_down(start, byte_counter - resume_len)

        if stream is None:
            self.trouble(u'\nERROR: Did not get any data blocks')
            return False
        stream.close()
        self.report_finish()
        if data_len is not None and byte_counter != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        self.try_rename(tmpfilename, filename)

        # Update file modification time
        if self.params.get('updatetime', True):
            self.try_utime(filename, data.info().get('last-modified', None))

        return True
997
998
class InfoExtractor(object):
	"""Base class for information extractors.

	An information extractor turns a URL into one or more dictionaries
	describing the videos behind it (real media URL, title, simplified
	title, uploader and so on), which are then handed to the
	FileDownloader for processing (possibly downloading the video to
	disk, among other outcomes). Each dictionary must carry these
	fields:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	The following fields are optional and mainly support using
	youtube-dl as the backend of a video search function (such as the
	one in youtube2mp3); they are only consulted by the forced
	printing functions:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Concrete extractors override _real_initialize(), _real_extract()
	and the static suitable() method, and are normally instantiated
	and registered with the main downloader.
	"""

	# Class-level defaults; instances shadow these in __init__/initialize.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Create an uninitialized extractor bound to an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this extractor can handle the given URL."""
		return False

	def initialize(self):
		"""Perform one-time setup (authentication, etc) on first use."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract information for the URL."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the downloader this extractor reports through."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Actual initialization; redefined by subclasses."""
		pass

	def _real_extract(self, url):
		"""Actual extraction; redefined by subclasses."""
		pass
1069
1070
1071 class YoutubeIE(InfoExtractor):
1072 """Information extractor for youtube.com."""
1073
1074 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1075 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1076 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1077 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1078 _NETRC_MACHINE = 'youtube'
1079 # Listed in order of quality
1080 _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1081 _video_extensions = {
1082 '13': '3gp',
1083 '17': 'mp4',
1084 '18': 'mp4',
1085 '22': 'mp4',
1086 '37': 'mp4',
1087 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1088 '43': 'webm',
1089 '45': 'webm',
1090 }
1091
1092 @staticmethod
1093 def suitable(url):
1094 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1095
1096 def report_lang(self):
1097 """Report attempt to set language."""
1098 self._downloader.to_screen(u'[youtube] Setting language')
1099
1100 def report_login(self):
1101 """Report attempt to log in."""
1102 self._downloader.to_screen(u'[youtube] Logging in')
1103
1104 def report_age_confirmation(self):
1105 """Report attempt to confirm age."""
1106 self._downloader.to_screen(u'[youtube] Confirming age')
1107
1108 def report_video_webpage_download(self, video_id):
1109 """Report attempt to download video webpage."""
1110 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1111
1112 def report_video_info_webpage_download(self, video_id):
1113 """Report attempt to download video info webpage."""
1114 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1115
1116 def report_information_extraction(self, video_id):
1117 """Report attempt to extract video information."""
1118 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1119
1120 def report_unavailable_format(self, video_id, format):
1121 """Report extracted video URL."""
1122 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1123
1124 def report_rtmp_download(self):
1125 """Indicate the download will use the RTMP protocol."""
1126 self._downloader.to_screen(u'[youtube] RTMP download detected')
1127
1128 def _real_initialize(self):
1129 if self._downloader is None:
1130 return
1131
1132 username = None
1133 password = None
1134 downloader_params = self._downloader.params
1135
1136 # Attempt to use provided username and password or .netrc data
1137 if downloader_params.get('username', None) is not None:
1138 username = downloader_params['username']
1139 password = downloader_params['password']
1140 elif downloader_params.get('usenetrc', False):
1141 try:
1142 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1143 if info is not None:
1144 username = info[0]
1145 password = info[2]
1146 else:
1147 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1148 except (IOError, netrc.NetrcParseError), err:
1149 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1150 return
1151
1152 # Set language
1153 request = urllib2.Request(self._LANG_URL)
1154 try:
1155 self.report_lang()
1156 urllib2.urlopen(request).read()
1157 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1158 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1159 return
1160
1161 # No authentication to be performed
1162 if username is None:
1163 return
1164
1165 # Log in
1166 login_form = {
1167 'current_form': 'loginForm',
1168 'next': '/',
1169 'action_login': 'Log In',
1170 'username': username,
1171 'password': password,
1172 }
1173 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1174 try:
1175 self.report_login()
1176 login_results = urllib2.urlopen(request).read()
1177 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1178 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1179 return
1180 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1181 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1182 return
1183
1184 # Confirm age
1185 age_form = {
1186 'next_url': '/',
1187 'action_confirm': 'Confirm',
1188 }
1189 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1190 try:
1191 self.report_age_confirmation()
1192 age_results = urllib2.urlopen(request).read()
1193 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1194 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1195 return
1196
1197 def _real_extract(self, url):
1198 # Extract video id from URL
1199 mobj = re.match(self._VALID_URL, url)
1200 if mobj is None:
1201 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1202 return
1203 video_id = mobj.group(2)
1204
1205 # Get video webpage
1206 self.report_video_webpage_download(video_id)
1207 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1208 try:
1209 video_webpage = urllib2.urlopen(request).read()
1210 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1211 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1212 return
1213
1214 # Attempt to extract SWF player URL
1215 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1216 if mobj is not None:
1217 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1218 else:
1219 player_url = None
1220
1221 # Get video info
1222 self.report_video_info_webpage_download(video_id)
1223 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1224 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1225 % (video_id, el_type))
1226 request = urllib2.Request(video_info_url)
1227 try:
1228 video_info_webpage = urllib2.urlopen(request).read()
1229 video_info = parse_qs(video_info_webpage)
1230 if 'token' in video_info:
1231 break
1232 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1233 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1234 return
1235 if 'token' not in video_info:
1236 if 'reason' in video_info:
1237 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1238 else:
1239 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1240 return
1241
1242 # Start extracting information
1243 self.report_information_extraction(video_id)
1244
1245 # uploader
1246 if 'author' not in video_info:
1247 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1248 return
1249 video_uploader = urllib.unquote_plus(video_info['author'][0])
1250
1251 # title
1252 if 'title' not in video_info:
1253 self._downloader.trouble(u'ERROR: unable to extract video title')
1254 return
1255 video_title = urllib.unquote_plus(video_info['title'][0])
1256 video_title = video_title.decode('utf-8')
1257 video_title = sanitize_title(video_title)
1258
1259 # simplified title
1260 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1261 simple_title = simple_title.strip(ur'_')
1262
1263 # thumbnail image
1264 if 'thumbnail_url' not in video_info:
1265 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1266 video_thumbnail = ''
1267 else: # don't panic if we can't find it
1268 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1269
1270 # upload date
1271 upload_date = u'NA'
1272 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1273 if mobj is not None:
1274 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1275 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1276 for expression in format_expressions:
1277 try:
1278 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1279 except:
1280 pass
1281
1282 # description
1283 try:
1284 lxml.etree
1285 except NameError:
1286 video_description = u'No description available.'
1287 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1288 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1289 if mobj is not None:
1290 video_description = mobj.group(1).decode('utf-8')
1291 else:
1292 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1293 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1294 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1295 # TODO use another parser
1296
1297 # token
1298 video_token = urllib.unquote_plus(video_info['token'][0])
1299
1300 # Decide which formats to download
1301 req_format = self._downloader.params.get('format', None)
1302
1303 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1304 self.report_rtmp_download()
1305 video_url_list = [(None, video_info['conn'][0])]
1306 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1307 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1308 url_data = [parse_qs(uds) for uds in url_data_strs]
1309 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1310 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1311
1312 format_limit = self._downloader.params.get('format_limit', None)
1313 if format_limit is not None and format_limit in self._available_formats:
1314 format_list = self._available_formats[self._available_formats.index(format_limit):]
1315 else:
1316 format_list = self._available_formats
1317 existing_formats = [x for x in format_list if x in url_map]
1318 if len(existing_formats) == 0:
1319 self._downloader.trouble(u'ERROR: no known formats available for video')
1320 return
1321 if req_format is None:
1322 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1323 elif req_format == '-1':
1324 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1325 else:
1326 # Specific format
1327 if req_format not in url_map:
1328 self._downloader.trouble(u'ERROR: requested format not available')
1329 return
1330 video_url_list = [(req_format, url_map[req_format])] # Specific format
1331 else:
1332 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1333 return
1334
1335 for format_param, video_real_url in video_url_list:
1336 # At this point we have a new video
1337 self._downloader.increment_downloads()
1338
1339 # Extension
1340 video_extension = self._video_extensions.get(format_param, 'flv')
1341
1342 try:
1343 # Process video information
1344 self._downloader.process_info({
1345 'id': video_id.decode('utf-8'),
1346 'url': video_real_url.decode('utf-8'),
1347 'uploader': video_uploader.decode('utf-8'),
1348 'upload_date': upload_date,
1349 'title': video_title,
1350 'stitle': simple_title,
1351 'ext': video_extension.decode('utf-8'),
1352 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1353 'thumbnail': video_thumbnail.decode('utf-8'),
1354 'description': video_description,
1355 'player_url': player_url,
1356 })
1357 except UnavailableVideoError, err:
1358 self._downloader.trouble(u'\nERROR: unable to download video')
1359
1360
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# Group 1 is the video id, group 2 the simplified title slug.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	# YoutubeIE instance used for Metacafe entries that are YouTube embeds.
	_youtube_ie = None

	def __init__(self, youtube_ie, downloader=None):
		"""Constructor. Stores the YoutubeIE used for 'yt-' prefixed ids."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return (re.match(MetacafeIE._VALID_URL, url) is not None)

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the disclaimer page, then POST the family-filter opt-out."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
		}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract media URL, title and uploader and process the video."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube: 'yt-' prefixed ids are embeds,
		# so the whole extraction is delegated to the YouTube IE.
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			# Last three characters of the URL are taken as the extension.
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			# (presumably an access key that must be appended as __gda__ —
			# TODO confirm against current Metacafe pages)
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Fall back to the flashvars blob when &mediaURL= is absent.
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			# Undo the JSON escaping of slashes in the media URL.
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_url.decode('utf-8'),
				'uploader': video_uploader.decode('utf-8'),
				'upload_date': u'NA',
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
				'format': u'NA',
				'player_url': None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1504
1505
1506 class DailymotionIE(InfoExtractor):
1507 """Information Extractor for Dailymotion"""
1508
1509 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1510
1511 def __init__(self, downloader=None):
1512 InfoExtractor.__init__(self, downloader)
1513
1514 @staticmethod
1515 def suitable(url):
1516 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1517
1518 def report_download_webpage(self, video_id):
1519 """Report webpage download."""
1520 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1521
1522 def report_extraction(self, video_id):
1523 """Report information extraction."""
1524 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1525
1526 def _real_initialize(self):
1527 return
1528
1529 def _real_extract(self, url):
1530 # Extract id and simplified title from URL
1531 mobj = re.match(self._VALID_URL, url)
1532 if mobj is None:
1533 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1534 return
1535
1536 # At this point we have a new video
1537 self._downloader.increment_downloads()
1538 video_id = mobj.group(1)
1539
1540 simple_title = mobj.group(2).decode('utf-8')
1541 video_extension = 'flv'
1542
1543 # Retrieve video webpage to extract further information
1544 request = urllib2.Request(url)
1545 request.add_header('Cookie', 'family_filter=off')
1546 try:
1547 self.report_download_webpage(video_id)
1548 webpage = urllib2.urlopen(request).read()
1549 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1550 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1551 return
1552
1553 # Extract URL, uploader and title from webpage
1554 self.report_extraction(video_id)
1555 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1556 if mobj is None:
1557 self._downloader.trouble(u'ERROR: unable to extract media URL')
1558 return
1559 sequence = urllib.unquote(mobj.group(1))
1560 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1561 if mobj is None:
1562 self._downloader.trouble(u'ERROR: unable to extract media URL')
1563 return
1564 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1565
1566 # if needed add http://www.dailymotion.com/ if relative URL
1567
1568 video_url = mediaURL
1569
1570 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1571 if mobj is None:
1572 self._downloader.trouble(u'ERROR: unable to extract title')
1573 return
1574 video_title = mobj.group(1).decode('utf-8')
1575 video_title = sanitize_title(video_title)
1576
1577 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1578 if mobj is None:
1579 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1580 return
1581 video_uploader = mobj.group(1)
1582
1583 try:
1584 # Process video information
1585 self._downloader.process_info({
1586 'id': video_id.decode('utf-8'),
1587 'url': video_url.decode('utf-8'),
1588 'uploader': video_uploader.decode('utf-8'),
1589 'upload_date': u'NA',
1590 'title': video_title,
1591 'stitle': simple_title,
1592 'ext': video_extension.decode('utf-8'),
1593 'format': u'NA',
1594 'player_url': None,
1595 })
1596 except UnavailableVideoError:
1597 self._downloader.trouble(u'\nERROR: unable to download video')
1598
1599
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	# Group 1 is the docid query parameter.
	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return (re.match(GoogleIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No setup (login, cookies, ...) is required for Google Video.
		return

	def _real_extract(self, url):
		"""Extract media URL, title and description and process the video."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			# No direct mp4 download link; fall back to the flash URL.
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
		mediaURL = urllib.unquote(mobj.group(1))
		# Undo the JavaScript hex escaping of '=' and '&' (no-op for the
		# direct download_url branch).
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail
		if self._downloader.params.get('forcethumbnail', False):
			# NOTE(review): assumes the docid is an integer — int() raises
			# ValueError for non-numeric ids; TODO confirm.
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''

		try:
			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_url.decode('utf-8'),
				'uploader': u'NA',
				'upload_date': u'NA',
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
				'format': u'NA',
				'player_url': None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1708
1709
1710 class PhotobucketIE(InfoExtractor):
1711 """Information extractor for photobucket.com."""
1712
1713 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1714
1715 def __init__(self, downloader=None):
1716 InfoExtractor.__init__(self, downloader)
1717
1718 @staticmethod
1719 def suitable(url):
1720 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1721
1722 def report_download_webpage(self, video_id):
1723 """Report webpage download."""
1724 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1725
1726 def report_extraction(self, video_id):
1727 """Report information extraction."""
1728 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1729
1730 def _real_initialize(self):
1731 return
1732
1733 def _real_extract(self, url):
1734 # Extract id from URL
1735 mobj = re.match(self._VALID_URL, url)
1736 if mobj is None:
1737 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1738 return
1739
1740 # At this point we have a new video
1741 self._downloader.increment_downloads()
1742 video_id = mobj.group(1)
1743
1744 video_extension = 'flv'
1745
1746 # Retrieve video webpage to extract further information
1747 request = urllib2.Request(url)
1748 try:
1749 self.report_download_webpage(video_id)
1750 webpage = urllib2.urlopen(request).read()
1751 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1752 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1753 return
1754
1755 # Extract URL, uploader, and title from webpage
1756 self.report_extraction(video_id)
1757 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1758 if mobj is None:
1759 self._downloader.trouble(u'ERROR: unable to extract media URL')
1760 return
1761 mediaURL = urllib.unquote(mobj.group(1))
1762
1763 video_url = mediaURL
1764
1765 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1766 if mobj is None:
1767 self._downloader.trouble(u'ERROR: unable to extract title')
1768 return
1769 video_title = mobj.group(1).decode('utf-8')
1770 video_title = sanitize_title(video_title)
1771 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1772
1773 video_uploader = mobj.group(2).decode('utf-8')
1774
1775 try:
1776 # Process video information
1777 self._downloader.process_info({
1778 'id': video_id.decode('utf-8'),
1779 'url': video_url.decode('utf-8'),
1780 'uploader': video_uploader,
1781 'upload_date': u'NA',
1782 'title': video_title,
1783 'stitle': simple_title,
1784 'ext': video_extension.decode('utf-8'),
1785 'format': u'NA',
1786 'player_url': None,
1787 })
1788 except UnavailableVideoError:
1789 self._downloader.trouble(u'\nERROR: unable to download video')
1790
1791
1792 class YahooIE(InfoExtractor):
1793 """Information extractor for video.yahoo.com."""
1794
1795 # _VALID_URL matches all Yahoo! Video URLs
1796 # _VPAGE_URL matches only the extractable '/watch/' URLs
1797 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1798 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1799
1800 def __init__(self, downloader=None):
1801 InfoExtractor.__init__(self, downloader)
1802
1803 @staticmethod
1804 def suitable(url):
1805 return (re.match(YahooIE._VALID_URL, url) is not None)
1806
1807 def report_download_webpage(self, video_id):
1808 """Report webpage download."""
1809 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1810
1811 def report_extraction(self, video_id):
1812 """Report information extraction."""
1813 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1814
1815 def _real_initialize(self):
1816 return
1817
1818 def _real_extract(self, url, new_video=True):
1819 # Extract ID from URL
1820 mobj = re.match(self._VALID_URL, url)
1821 if mobj is None:
1822 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1823 return
1824
1825 # At this point we have a new video
1826 self._downloader.increment_downloads()
1827 video_id = mobj.group(2)
1828 video_extension = 'flv'
1829
1830 # Rewrite valid but non-extractable URLs as
1831 # extractable English language /watch/ URLs
1832 if re.match(self._VPAGE_URL, url) is None:
1833 request = urllib2.Request(url)
1834 try:
1835 webpage = urllib2.urlopen(request).read()
1836 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1837 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1838 return
1839
1840 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1841 if mobj is None:
1842 self._downloader.trouble(u'ERROR: Unable to extract id field')
1843 return
1844 yahoo_id = mobj.group(1)
1845
1846 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1847 if mobj is None:
1848 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1849 return
1850 yahoo_vid = mobj.group(1)
1851
1852 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1853 return self._real_extract(url, new_video=False)
1854
1855 # Retrieve video webpage to extract further information
1856 request = urllib2.Request(url)
1857 try:
1858 self.report_download_webpage(video_id)
1859 webpage = urllib2.urlopen(request).read()
1860 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1861 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1862 return
1863
1864 # Extract uploader and title from webpage
1865 self.report_extraction(video_id)
1866 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1867 if mobj is None:
1868 self._downloader.trouble(u'ERROR: unable to extract video title')
1869 return
1870 video_title = mobj.group(1).decode('utf-8')
1871 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1872
1873 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1874 if mobj is None:
1875 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1876 return
1877 video_uploader = mobj.group(1).decode('utf-8')
1878
1879 # Extract video thumbnail
1880 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1881 if mobj is None:
1882 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1883 return
1884 video_thumbnail = mobj.group(1).decode('utf-8')
1885
1886 # Extract video description
1887 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1888 if mobj is None:
1889 self._downloader.trouble(u'ERROR: unable to extract video description')
1890 return
1891 video_description = mobj.group(1).decode('utf-8')
1892 if not video_description:
1893 video_description = 'No description available.'
1894
1895 # Extract video height and width
1896 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1897 if mobj is None:
1898 self._downloader.trouble(u'ERROR: unable to extract video height')
1899 return
1900 yv_video_height = mobj.group(1)
1901
1902 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1903 if mobj is None:
1904 self._downloader.trouble(u'ERROR: unable to extract video width')
1905 return
1906 yv_video_width = mobj.group(1)
1907
1908 # Retrieve video playlist to extract media URL
1909 # I'm not completely sure what all these options are, but we
1910 # seem to need most of them, otherwise the server sends a 401.
1911 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1912 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1913 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1914 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1915 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1916 try:
1917 self.report_download_webpage(video_id)
1918 webpage = urllib2.urlopen(request).read()
1919 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1920 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1921 return
1922
1923 # Extract media URL from playlist XML
1924 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1925 if mobj is None:
1926 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1927 return
1928 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1929 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1930
1931 try:
1932 # Process video information
1933 self._downloader.process_info({
1934 'id': video_id.decode('utf-8'),
1935 'url': video_url,
1936 'uploader': video_uploader,
1937 'upload_date': u'NA',
1938 'title': video_title,
1939 'stitle': simple_title,
1940 'ext': video_extension.decode('utf-8'),
1941 'thumbnail': video_thumbnail.decode('utf-8'),
1942 'description': video_description,
1943 'thumbnail': video_thumbnail,
1944 'player_url': None,
1945 })
1946 except UnavailableVideoError:
1947 self._downloader.trouble(u'\nERROR: unable to download video')
1948
1949
1950 class VimeoIE(InfoExtractor):
1951 """Information extractor for vimeo.com."""
1952
1953 # _VALID_URL matches Vimeo URLs
1954 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1955
1956 def __init__(self, downloader=None):
1957 InfoExtractor.__init__(self, downloader)
1958
1959 @staticmethod
1960 def suitable(url):
1961 return (re.match(VimeoIE._VALID_URL, url) is not None)
1962
1963 def report_download_webpage(self, video_id):
1964 """Report webpage download."""
1965 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1966
1967 def report_extraction(self, video_id):
1968 """Report information extraction."""
1969 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1970
1971 def _real_initialize(self):
1972 return
1973
1974 def _real_extract(self, url, new_video=True):
1975 # Extract ID from URL
1976 mobj = re.match(self._VALID_URL, url)
1977 if mobj is None:
1978 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1979 return
1980
1981 # At this point we have a new video
1982 self._downloader.increment_downloads()
1983 video_id = mobj.group(1)
1984
1985 # Retrieve video webpage to extract further information
1986 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1987 try:
1988 self.report_download_webpage(video_id)
1989 webpage = urllib2.urlopen(request).read()
1990 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1991 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1992 return
1993
1994 # Now we begin extracting as much information as we can from what we
1995 # retrieved. First we extract the information common to all extractors,
1996 # and latter we extract those that are Vimeo specific.
1997 self.report_extraction(video_id)
1998
1999 # Extract title
2000 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2001 if mobj is None:
2002 self._downloader.trouble(u'ERROR: unable to extract video title')
2003 return
2004 video_title = mobj.group(1).decode('utf-8')
2005 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2006
2007 # Extract uploader
2008 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2009 if mobj is None:
2010 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2011 return
2012 video_uploader = mobj.group(1).decode('utf-8')
2013
2014 # Extract video thumbnail
2015 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2016 if mobj is None:
2017 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2018 return
2019 video_thumbnail = mobj.group(1).decode('utf-8')
2020
2021 # # Extract video description
2022 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2023 # if mobj is None:
2024 # self._downloader.trouble(u'ERROR: unable to extract video description')
2025 # return
2026 # video_description = mobj.group(1).decode('utf-8')
2027 # if not video_description: video_description = 'No description available.'
2028 video_description = 'Foo.'
2029
2030 # Vimeo specific: extract request signature
2031 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2032 if mobj is None:
2033 self._downloader.trouble(u'ERROR: unable to extract request signature')
2034 return
2035 sig = mobj.group(1).decode('utf-8')
2036
2037 # Vimeo specific: Extract request signature expiration
2038 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2039 if mobj is None:
2040 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2041 return
2042 sig_exp = mobj.group(1).decode('utf-8')
2043
2044 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2045
2046 try:
2047 # Process video information
2048 self._downloader.process_info({
2049 'id': video_id.decode('utf-8'),
2050 'url': video_url,
2051 'uploader': video_uploader,
2052 'upload_date': u'NA',
2053 'title': video_title,
2054 'stitle': simple_title,
2055 'ext': u'mp4',
2056 'thumbnail': video_thumbnail.decode('utf-8'),
2057 'description': video_description,
2058 'thumbnail': video_thumbnail,
2059 'description': video_description,
2060 'player_url': None,
2061 })
2062 except UnavailableVideoError:
2063 self._downloader.trouble(u'ERROR: unable to download video')
2064
2065
2066 class GenericIE(InfoExtractor):
2067 """Generic last-resort information extractor."""
2068
2069 def __init__(self, downloader=None):
2070 InfoExtractor.__init__(self, downloader)
2071
2072 @staticmethod
2073 def suitable(url):
2074 return True
2075
2076 def report_download_webpage(self, video_id):
2077 """Report webpage download."""
2078 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2079 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2080
2081 def report_extraction(self, video_id):
2082 """Report information extraction."""
2083 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2084
2085 def _real_initialize(self):
2086 return
2087
2088 def _real_extract(self, url):
2089 # At this point we have a new video
2090 self._downloader.increment_downloads()
2091
2092 video_id = url.split('/')[-1]
2093 request = urllib2.Request(url)
2094 try:
2095 self.report_download_webpage(video_id)
2096 webpage = urllib2.urlopen(request).read()
2097 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2098 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2099 return
2100 except ValueError, err:
2101 # since this is the last-resort InfoExtractor, if
2102 # this error is thrown, it'll be thrown here
2103 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2104 return
2105
2106 self.report_extraction(video_id)
2107 # Start with something easy: JW Player in SWFObject
2108 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2109 if mobj is None:
2110 # Broaden the search a little bit
2111 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2112 if mobj is None:
2113 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2114 return
2115
2116 # It's possible that one of the regexes
2117 # matched, but returned an empty group:
2118 if mobj.group(1) is None:
2119 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2120 return
2121
2122 video_url = urllib.unquote(mobj.group(1))
2123 video_id = os.path.basename(video_url)
2124
2125 # here's a fun little line of code for you:
2126 video_extension = os.path.splitext(video_id)[1][1:]
2127 video_id = os.path.splitext(video_id)[0]
2128
2129 # it's tempting to parse this further, but you would
2130 # have to take into account all the variations like
2131 # Video Title - Site Name
2132 # Site Name | Video Title
2133 # Video Title - Tagline | Site Name
2134 # and so on and so forth; it's just not practical
2135 mobj = re.search(r'<title>(.*)</title>', webpage)
2136 if mobj is None:
2137 self._downloader.trouble(u'ERROR: unable to extract title')
2138 return
2139 video_title = mobj.group(1).decode('utf-8')
2140 video_title = sanitize_title(video_title)
2141 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2142
2143 # video uploader is domain name
2144 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2145 if mobj is None:
2146 self._downloader.trouble(u'ERROR: unable to extract title')
2147 return
2148 video_uploader = mobj.group(1).decode('utf-8')
2149
2150 try:
2151 # Process video information
2152 self._downloader.process_info({
2153 'id': video_id.decode('utf-8'),
2154 'url': video_url.decode('utf-8'),
2155 'uploader': video_uploader,
2156 'upload_date': u'NA',
2157 'title': video_title,
2158 'stitle': simple_title,
2159 'ext': video_extension.decode('utf-8'),
2160 'format': u'NA',
2161 'player_url': None,
2162 })
2163 except UnavailableVideoError, err:
2164 self._downloader.trouble(u'\nERROR: unable to download video')
2165
2166
2167 class YoutubeSearchIE(InfoExtractor):
2168 """Information Extractor for YouTube search queries."""
2169 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2170 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2171 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2172 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2173 _youtube_ie = None
2174 _max_youtube_results = 1000
2175
2176 def __init__(self, youtube_ie, downloader=None):
2177 InfoExtractor.__init__(self, downloader)
2178 self._youtube_ie = youtube_ie
2179
2180 @staticmethod
2181 def suitable(url):
2182 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2183
2184 def report_download_page(self, query, pagenum):
2185 """Report attempt to download playlist page with given number."""
2186 query = query.decode(preferredencoding())
2187 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2188
2189 def _real_initialize(self):
2190 self._youtube_ie.initialize()
2191
2192 def _real_extract(self, query):
2193 mobj = re.match(self._VALID_QUERY, query)
2194 if mobj is None:
2195 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2196 return
2197
2198 prefix, query = query.split(':')
2199 prefix = prefix[8:]
2200 query = query.encode('utf-8')
2201 if prefix == '':
2202 self._download_n_results(query, 1)
2203 return
2204 elif prefix == 'all':
2205 self._download_n_results(query, self._max_youtube_results)
2206 return
2207 else:
2208 try:
2209 n = long(prefix)
2210 if n <= 0:
2211 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2212 return
2213 elif n > self._max_youtube_results:
2214 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2215 n = self._max_youtube_results
2216 self._download_n_results(query, n)
2217 return
2218 except ValueError: # parsing prefix as integer fails
2219 self._download_n_results(query, 1)
2220 return
2221
2222 def _download_n_results(self, query, n):
2223 """Downloads a specified number of results for a query"""
2224
2225 video_ids = []
2226 already_seen = set()
2227 pagenum = 1
2228
2229 while True:
2230 self.report_download_page(query, pagenum)
2231 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2232 request = urllib2.Request(result_url)
2233 try:
2234 page = urllib2.urlopen(request).read()
2235 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2236 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2237 return
2238
2239 # Extract video identifiers
2240 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2241 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2242 if video_id not in already_seen:
2243 video_ids.append(video_id)
2244 already_seen.add(video_id)
2245 if len(video_ids) == n:
2246 # Specified n videos reached
2247 for id in video_ids:
2248 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2249 return
2250
2251 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2252 for id in video_ids:
2253 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2254 return
2255
2256 pagenum = pagenum + 1
2257
2258
2259 class GoogleSearchIE(InfoExtractor):
2260 """Information Extractor for Google Video search queries."""
2261 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2262 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2263 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2264 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2265 _google_ie = None
2266 _max_google_results = 1000
2267
2268 def __init__(self, google_ie, downloader=None):
2269 InfoExtractor.__init__(self, downloader)
2270 self._google_ie = google_ie
2271
2272 @staticmethod
2273 def suitable(url):
2274 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2275
2276 def report_download_page(self, query, pagenum):
2277 """Report attempt to download playlist page with given number."""
2278 query = query.decode(preferredencoding())
2279 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2280
2281 def _real_initialize(self):
2282 self._google_ie.initialize()
2283
2284 def _real_extract(self, query):
2285 mobj = re.match(self._VALID_QUERY, query)
2286 if mobj is None:
2287 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2288 return
2289
2290 prefix, query = query.split(':')
2291 prefix = prefix[8:]
2292 query = query.encode('utf-8')
2293 if prefix == '':
2294 self._download_n_results(query, 1)
2295 return
2296 elif prefix == 'all':
2297 self._download_n_results(query, self._max_google_results)
2298 return
2299 else:
2300 try:
2301 n = long(prefix)
2302 if n <= 0:
2303 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2304 return
2305 elif n > self._max_google_results:
2306 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2307 n = self._max_google_results
2308 self._download_n_results(query, n)
2309 return
2310 except ValueError: # parsing prefix as integer fails
2311 self._download_n_results(query, 1)
2312 return
2313
2314 def _download_n_results(self, query, n):
2315 """Downloads a specified number of results for a query"""
2316
2317 video_ids = []
2318 already_seen = set()
2319 pagenum = 1
2320
2321 while True:
2322 self.report_download_page(query, pagenum)
2323 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2324 request = urllib2.Request(result_url)
2325 try:
2326 page = urllib2.urlopen(request).read()
2327 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2328 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2329 return
2330
2331 # Extract video identifiers
2332 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2333 video_id = mobj.group(1)
2334 if video_id not in already_seen:
2335 video_ids.append(video_id)
2336 already_seen.add(video_id)
2337 if len(video_ids) == n:
2338 # Specified n videos reached
2339 for id in video_ids:
2340 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2341 return
2342
2343 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2344 for id in video_ids:
2345 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2346 return
2347
2348 pagenum = pagenum + 1
2349
2350
2351 class YahooSearchIE(InfoExtractor):
2352 """Information Extractor for Yahoo! Video search queries."""
2353 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2354 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2355 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2356 _MORE_PAGES_INDICATOR = r'\s*Next'
2357 _yahoo_ie = None
2358 _max_yahoo_results = 1000
2359
2360 def __init__(self, yahoo_ie, downloader=None):
2361 InfoExtractor.__init__(self, downloader)
2362 self._yahoo_ie = yahoo_ie
2363
2364 @staticmethod
2365 def suitable(url):
2366 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2367
2368 def report_download_page(self, query, pagenum):
2369 """Report attempt to download playlist page with given number."""
2370 query = query.decode(preferredencoding())
2371 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2372
2373 def _real_initialize(self):
2374 self._yahoo_ie.initialize()
2375
2376 def _real_extract(self, query):
2377 mobj = re.match(self._VALID_QUERY, query)
2378 if mobj is None:
2379 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2380 return
2381
2382 prefix, query = query.split(':')
2383 prefix = prefix[8:]
2384 query = query.encode('utf-8')
2385 if prefix == '':
2386 self._download_n_results(query, 1)
2387 return
2388 elif prefix == 'all':
2389 self._download_n_results(query, self._max_yahoo_results)
2390 return
2391 else:
2392 try:
2393 n = long(prefix)
2394 if n <= 0:
2395 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2396 return
2397 elif n > self._max_yahoo_results:
2398 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2399 n = self._max_yahoo_results
2400 self._download_n_results(query, n)
2401 return
2402 except ValueError: # parsing prefix as integer fails
2403 self._download_n_results(query, 1)
2404 return
2405
2406 def _download_n_results(self, query, n):
2407 """Downloads a specified number of results for a query"""
2408
2409 video_ids = []
2410 already_seen = set()
2411 pagenum = 1
2412
2413 while True:
2414 self.report_download_page(query, pagenum)
2415 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2416 request = urllib2.Request(result_url)
2417 try:
2418 page = urllib2.urlopen(request).read()
2419 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2420 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2421 return
2422
2423 # Extract video identifiers
2424 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2425 video_id = mobj.group(1)
2426 if video_id not in already_seen:
2427 video_ids.append(video_id)
2428 already_seen.add(video_id)
2429 if len(video_ids) == n:
2430 # Specified n videos reached
2431 for id in video_ids:
2432 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2433 return
2434
2435 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2436 for id in video_ids:
2437 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2438 return
2439
2440 pagenum = pagenum + 1
2441
2442
2443 class YoutubePlaylistIE(InfoExtractor):
2444 """Information Extractor for YouTube playlists."""
2445
2446 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2447 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2448 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2449 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2450 _youtube_ie = None
2451
2452 def __init__(self, youtube_ie, downloader=None):
2453 InfoExtractor.__init__(self, downloader)
2454 self._youtube_ie = youtube_ie
2455
2456 @staticmethod
2457 def suitable(url):
2458 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2459
2460 def report_download_page(self, playlist_id, pagenum):
2461 """Report attempt to download playlist page with given number."""
2462 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2463
2464 def _real_initialize(self):
2465 self._youtube_ie.initialize()
2466
2467 def _real_extract(self, url):
2468 # Extract playlist id
2469 mobj = re.match(self._VALID_URL, url)
2470 if mobj is None:
2471 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2472 return
2473
2474 # Single video case
2475 if mobj.group(3) is not None:
2476 self._youtube_ie.extract(mobj.group(3))
2477 return
2478
2479 # Download playlist pages
2480 # prefix is 'p' as default for playlists but there are other types that need extra care
2481 playlist_prefix = mobj.group(1)
2482 if playlist_prefix == 'a':
2483 playlist_access = 'artist'
2484 else:
2485 playlist_prefix = 'p'
2486 playlist_access = 'view_play_list'
2487 playlist_id = mobj.group(2)
2488 video_ids = []
2489 pagenum = 1
2490
2491 while True:
2492 self.report_download_page(playlist_id, pagenum)
2493 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2494 try:
2495 page = urllib2.urlopen(request).read()
2496 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2497 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2498 return
2499
2500 # Extract video identifiers
2501 ids_in_page = []
2502 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2503 if mobj.group(1) not in ids_in_page:
2504 ids_in_page.append(mobj.group(1))
2505 video_ids.extend(ids_in_page)
2506
2507 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2508 break
2509 pagenum = pagenum + 1
2510
2511 playliststart = self._downloader.params.get('playliststart', 1) - 1
2512 playlistend = self._downloader.params.get('playlistend', -1)
2513 video_ids = video_ids[playliststart:playlistend]
2514
2515 for id in video_ids:
2516 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2517 return
2518
2519
2520 class YoutubeUserIE(InfoExtractor):
2521 """Information Extractor for YouTube users."""
2522
2523 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2524 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2525 _GDATA_PAGE_SIZE = 50
2526 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2527 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2528 _youtube_ie = None
2529
2530 def __init__(self, youtube_ie, downloader=None):
2531 InfoExtractor.__init__(self, downloader)
2532 self._youtube_ie = youtube_ie
2533
2534 @staticmethod
2535 def suitable(url):
2536 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2537
2538 def report_download_page(self, username, start_index):
2539 """Report attempt to download user page."""
2540 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2541 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2542
2543 def _real_initialize(self):
2544 self._youtube_ie.initialize()
2545
2546 def _real_extract(self, url):
2547 # Extract username
2548 mobj = re.match(self._VALID_URL, url)
2549 if mobj is None:
2550 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2551 return
2552
2553 username = mobj.group(1)
2554
2555 # Download video ids using YouTube Data API. Result size per
2556 # query is limited (currently to 50 videos) so we need to query
2557 # page by page until there are no video ids - it means we got
2558 # all of them.
2559
2560 video_ids = []
2561 pagenum = 0
2562
2563 while True:
2564 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2565 self.report_download_page(username, start_index)
2566
2567 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2568
2569 try:
2570 page = urllib2.urlopen(request).read()
2571 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2572 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2573 return
2574
2575 # Extract video identifiers
2576 ids_in_page = []
2577
2578 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2579 if mobj.group(1) not in ids_in_page:
2580 ids_in_page.append(mobj.group(1))
2581
2582 video_ids.extend(ids_in_page)
2583
2584 # A little optimization - if current page is not
2585 # "full", ie. does not contain PAGE_SIZE video ids then
2586 # we can assume that this page is the last one - there
2587 # are no more ids on further pages - no need to query
2588 # again.
2589
2590 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2591 break
2592
2593 pagenum += 1
2594
2595 all_ids_count = len(video_ids)
2596 playliststart = self._downloader.params.get('playliststart', 1) - 1
2597 playlistend = self._downloader.params.get('playlistend', -1)
2598
2599 if playlistend == -1:
2600 video_ids = video_ids[playliststart:]
2601 else:
2602 video_ids = video_ids[playliststart:playlistend]
2603
2604 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2605 (username, all_ids_count, len(video_ids)))
2606
2607 for video_id in video_ids:
2608 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2609
2610
2611 class DepositFilesIE(InfoExtractor):
2612 """Information extractor for depositfiles.com"""
2613
2614 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2615
2616 def __init__(self, downloader=None):
2617 InfoExtractor.__init__(self, downloader)
2618
2619 @staticmethod
2620 def suitable(url):
2621 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2622
2623 def report_download_webpage(self, file_id):
2624 """Report webpage download."""
2625 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2626
2627 def report_extraction(self, file_id):
2628 """Report information extraction."""
2629 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2630
2631 def _real_initialize(self):
2632 return
2633
2634 def _real_extract(self, url):
2635 # At this point we have a new file
2636 self._downloader.increment_downloads()
2637
2638 file_id = url.split('/')[-1]
2639 # Rebuild url in english locale
2640 url = 'http://depositfiles.com/en/files/' + file_id
2641
2642 # Retrieve file webpage with 'Free download' button pressed
2643 free_download_indication = { 'gateway_result' : '1' }
2644 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2645 try:
2646 self.report_download_webpage(file_id)
2647 webpage = urllib2.urlopen(request).read()
2648 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2649 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2650 return
2651
2652 # Search for the real file URL
2653 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2654 if (mobj is None) or (mobj.group(1) is None):
2655 # Try to figure out reason of the error.
2656 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2657 if (mobj is not None) and (mobj.group(1) is not None):
2658 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2659 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2660 else:
2661 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2662 return
2663
2664 file_url = mobj.group(1)
2665 file_extension = os.path.splitext(file_url)[1][1:]
2666
2667 # Search for file title
2668 mobj = re.search(r'<b title="(.*?)">', webpage)
2669 if mobj is None:
2670 self._downloader.trouble(u'ERROR: unable to extract title')
2671 return
2672 file_title = mobj.group(1).decode('utf-8')
2673
2674 try:
2675 # Process file information
2676 self._downloader.process_info({
2677 'id': file_id.decode('utf-8'),
2678 'url': file_url.decode('utf-8'),
2679 'uploader': u'NA',
2680 'upload_date': u'NA',
2681 'title': file_title,
2682 'stitle': file_title,
2683 'ext': file_extension.decode('utf-8'),
2684 'format': u'NA',
2685 'player_url': None,
2686 })
2687 except UnavailableVideoError, err:
2688 self._downloader.trouble(u'ERROR: unable to download file')
2689
2690
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Matches facebook.com video pages; the numeric video id is captured
    # in the named group 'ID'.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name looked up in ~/.netrc when --netrc is given.
    _NETRC_MACHINE = 'facebook'
    # Ordered best-first; both map to mp4 containers.
    _available_formats = ['highqual', 'lowqual']
    _video_extensions = {
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        return (re.match(FacebookIE._VALID_URL, url) is not None)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data: one regex per metadata field. Fields whose regex
        # does not match are simply absent from the returned dict.
        data = {'title': r'class="video_title datawrap">(.*?)</',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'upload_date': r'data-date="(.*?)"',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are embedded JS strings: undo \uXXXX escapes,
                # then URL quoting.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls, one per known format name found on the page.
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        # Optional login step; silently does nothing without a downloader
        # or credentials.
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # Login failure is non-fatal: warn and continue anonymously.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # A login form in the response means the login did not succeed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title: everything outside the safe character set
        # becomes an underscore.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail image (missing thumbnail is only a warning)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date, normalized to YYYYMMDD when parseable
        upload_date = u'NA'
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # NOTE(review): if url_map is empty, video_url_list is never
        # assigned and this loop raises NameError -- confirm intended.
        for format_param, video_real_url in video_url_list:

            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description.decode('utf-8'),
                    'player_url': None,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
2909
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    # Any blip.tv path; group(1) (the path) is used only for progress output.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Used to pull the filename extension off the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'

    @staticmethod
    def suitable(url):
        return (re.match(BlipTVIE._VALID_URL, url) is not None)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)

    def _simplify_title(self, title):
        # Collapse characters outside the safe set into single underscores.
        res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
        res = res.strip(ur'_')
        return res

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Ask blip.tv for the JSON description of the same page.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
        try:
            json_code = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
            return
        try:
            json_data = json.loads(json_code)
            # Some responses wrap the payload in a 'Post' envelope.
            if 'Post' in json_data:
                data = json_data['Post']
            else:
                data = json_data

            # NOTE(review): '%H:%M%p' mixes 24-hour %H with %p -- strptime
            # then ignores the AM/PM marker; confirm against blip.tv data.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            if umobj is None:
                raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            self._downloader.increment_downloads()

            info = {
                'id': data['item_id'],
                'url': video_url,
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'stitle': self._simplify_title(data['title']),
                'ext': ext,
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl']
            }
        except (ValueError,KeyError), err:
            # Covers both malformed JSON values and missing expected keys.
            self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
            return

        try:
            self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
2984
2985
2986 class MyVideoIE(InfoExtractor):
2987 """Information Extractor for myvideo.de."""
2988
2989 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2990
2991 def __init__(self, downloader=None):
2992 InfoExtractor.__init__(self, downloader)
2993
2994 @staticmethod
2995 def suitable(url):
2996 return (re.match(MyVideoIE._VALID_URL, url) is not None)
2997
2998 def report_download_webpage(self, video_id):
2999 """Report webpage download."""
3000 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3001
3002 def report_extraction(self, video_id):
3003 """Report information extraction."""
3004 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3005
3006 def _real_initialize(self):
3007 return
3008
3009 def _real_extract(self,url):
3010 mobj = re.match(self._VALID_URL, url)
3011 if mobj is None:
3012 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3013 return
3014
3015 video_id = mobj.group(1)
3016 simple_title = mobj.group(2).decode('utf-8')
3017 # should actually not be necessary
3018 simple_title = sanitize_title(simple_title)
3019 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
3020
3021 # Get video webpage
3022 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3023 try:
3024 self.report_download_webpage(video_id)
3025 webpage = urllib2.urlopen(request).read()
3026 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3027 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3028 return
3029
3030 self.report_extraction(video_id)
3031 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3032 webpage)
3033 if mobj is None:
3034 self._downloader.trouble(u'ERROR: unable to extract media URL')
3035 return
3036 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3037
3038 mobj = re.search('<title>([^<]+)</title>', webpage)
3039 if mobj is None:
3040 self._downloader.trouble(u'ERROR: unable to extract title')
3041 return
3042
3043 video_title = mobj.group(1)
3044 video_title = sanitize_title(video_title)
3045
3046 try:
3047 print(video_url)
3048 self._downloader.process_info({
3049 'id': video_id,
3050 'url': video_url,
3051 'uploader': u'NA',
3052 'upload_date': u'NA',
3053 'title': video_title,
3054 'stitle': simple_title,
3055 'ext': u'flv',
3056 'format': u'NA',
3057 'player_url': None,
3058 })
3059 except UnavailableVideoError:
3060 self._downloader.trouble(u'\nERROR: Unable to download video')
3061
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts either a ':shortname' alias (e.g. ':tds') or a full
    # full-episodes URL; 'showname' and 'episode' are named groups.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'

    @staticmethod
    def suitable(url):
        return (re.match(ComedyCentralIE._VALID_URL, url) is not None)

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _simplify_title(self, title):
        # Collapse characters outside the safe set into single underscores.
        res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
        res = res.strip(ur'_')
        return res

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Expand ':tds'-style aliases to the show's full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = 'http://www.thedailyshow.com/full-episodes/'
            else:
                url = 'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # Without an explicit episode we download the newest one, which
        # the site reaches via a redirect.
        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
            return
        if dlNewest:
            # Re-parse the redirected URL to learn the concrete episode.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Each match is (full flash URL, mtvnservices URI).
        mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            return

        # Resolve the player URL through its redirect chain.
        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
            return

        # Fetch the MRSS index listing all media items of the episode.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
            return

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # One download per <item> (episodes are split into several acts).
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Per-item config XML lists the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            self._downloader.increment_downloads()

            effTitle = showId + '-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'stitle': self._simplify_title(effTitle),
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': playerUrl
            }

            try:
                self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
                continue
3204
3205
3206 class EscapistIE(InfoExtractor):
3207 """Information extractor for The Escapist """
3208
3209 _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
3210
3211 @staticmethod
3212 def suitable(url):
3213 return (re.match(EscapistIE._VALID_URL, url) is not None)
3214
3215 def report_extraction(self, showName):
3216 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3217
3218 def report_config_download(self, showName):
3219 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3220
3221 def _simplify_title(self, title):
3222 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3223 res = res.strip(ur'_')
3224 return res
3225
3226 def _real_extract(self, url):
3227 htmlParser = HTMLParser.HTMLParser()
3228
3229 mobj = re.match(self._VALID_URL, url)
3230 if mobj is None:
3231 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3232 return
3233 showName = mobj.group('showname')
3234 videoId = mobj.group('episode')
3235
3236 self.report_extraction(showName)
3237 try:
3238 webPage = urllib2.urlopen(url).read()
3239 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3240 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3241 return
3242
3243 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3244 description = htmlParser.unescape(descMatch.group(1))
3245 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3246 imgUrl = htmlParser.unescape(imgMatch.group(1))
3247 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3248 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3249 configUrlMatch = re.search('config=(.*)$', playerUrl)
3250 configUrl = urllib2.unquote(configUrlMatch.group(1))
3251
3252 self.report_config_download(showName)
3253 try:
3254 configJSON = urllib2.urlopen(configUrl).read()
3255 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3256 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3257 return
3258
3259 # Technically, it's JavaScript, not JSON
3260 configJSON = configJSON.replace("'", '"')
3261
3262 try:
3263 config = json.loads(configJSON)
3264 except (ValueError,), err:
3265 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3266 return
3267
3268 playlist = config['playlist']
3269 videoUrl = playlist[1]['url']
3270
3271 self._downloader.increment_downloads()
3272 info = {
3273 'id': videoId,
3274 'url': videoUrl,
3275 'uploader': showName,
3276 'upload_date': None,
3277 'title': showName,
3278 'stitle': self._simplify_title(showName),
3279 'ext': 'flv',
3280 'format': 'flv',
3281 'thumbnail': imgUrl,
3282 'description': description,
3283 'player_url': playerUrl,
3284 }
3285
3286 try:
3287 self._downloader.process_info(info)
3288 except UnavailableVideoError, err:
3289 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3290
3291
3292
class PostProcessor(object):
    """Base class for post-download processing steps.

    Instances are registered with a downloader through its
    add_post_processor() method. After each successful download the
    downloader walks its chain of PostProcessors, calling run() on each
    one and feeding it the dictionary returned by the previous call.

    A run() that returns None stops the chain; returning an information
    dictionary passes it on to the next processor. This "mutual
    registration" scheme mirrors the one used by InfoExtractor objects.
    """

    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach the downloader this post processor reports to."""
        self._downloader = downloader

    def run(self, information):
        """Process one downloaded file.

        "information" is an InfoExtractor-style dictionary extended with
        a "filepath" key naming the downloaded file. The default
        implementation is a no-op that forwards the dictionary unchanged.

        Implementations may raise PostProcessingError; the downloader
        that invoked the chain takes it into account.
        """
        return information
3338
3339
class FFmpegExtractAudioPP(PostProcessor):
    """Post processor that extracts the audio track of a downloaded video.

    Uses ffprobe to detect the source audio codec, then ffmpeg to either
    copy the stream losslessly (aac/mp3) or transcode it to the preferred
    codec. The original video file is removed on success.
    """

    def __init__(self, downloader=None, preferredcodec=None):
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            preferredcodec = 'best'
        # 'best', 'mp3' or 'aac'.
        self._preferredcodec = preferredcodec

    @staticmethod
    def get_audio_codec(path):
        """Return the name of the audio codec in the file at path, or None."""
        try:
            cmd = ['ffprobe', '-show_streams', '--', path]
            # BUGFIX: open devnull explicitly so the handle is closed
            # again (the original leaked an anonymous file() handle).
            devnull = open(os.path.devnull, 'w')
            try:
                handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
                output = handle.communicate()[0]
                if handle.wait() != 0:
                    return None
            finally:
                devnull.close()
        except (IOError, OSError):
            return None
        # ffprobe prints codec_name before codec_type for each stream;
        # remember the last codec_name and report it once we see an
        # audio stream.
        audio_codec = None
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                return audio_codec
        return None

    @staticmethod
    def run_ffmpeg(path, out_path, codec, more_opts):
        """Run ffmpeg to extract audio from path into out_path.

        Returns True on success (exit status 0), False otherwise.
        """
        try:
            cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
            # BUGFIX: close the devnull handle after use (was leaked).
            devnull = open(os.path.devnull, 'w')
            try:
                ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
            finally:
                devnull.close()
            return (ret == 0)
        except (IOError, OSError):
            return False

    def run(self, information):
        """Extract audio from information['filepath'] and update the path."""
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            return None

        more_opts = []
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
            if filecodec == 'aac' or filecodec == 'mp3':
                # Lossless if possible
                acodec = 'copy'
                extension = filecodec
                if filecodec == 'aac':
                    # Raw AAC streams need the ADTS container format.
                    more_opts = ['-f', 'adts']
            else:
                # MP3 otherwise.
                acodec = 'libmp3lame'
                extension = 'mp3'
                more_opts = ['-ab', '128k']
        else:
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
            extension = self._preferredcodec
            more_opts = ['-ab', '128k']
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']

        (prefix, ext) = os.path.splitext(path)
        new_path = prefix + '.' + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
        status = self.run_ffmpeg(path, new_path, acodec, more_opts)

        if not status:
            self._downloader.to_stderr(u'WARNING: error running ffmpeg')
            return None

        # Remove the source video only after a successful extraction.
        try:
            os.remove(path)
        except (IOError, OSError):
            self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
            return None

        information['filepath'] = new_path
        return information
3421
3422
3423 def updateSelf(downloader, filename):
3424 ''' Update the program file with the latest version from the repository '''
3425 # Note: downloader only used for options
3426 if not os.access(filename, os.W_OK):
3427 sys.exit('ERROR: no write permissions on %s' % filename)
3428
3429 downloader.to_screen('Updating to latest version...')
3430
3431 try:
3432 try:
3433 urlh = urllib.urlopen(UPDATE_URL)
3434 newcontent = urlh.read()
3435 finally:
3436 urlh.close()
3437 except (IOError, OSError), err:
3438 sys.exit('ERROR: unable to download latest version')
3439
3440 try:
3441 outf = open(filename, 'wb')
3442 try:
3443 outf.write(newcontent)
3444 finally:
3445 outf.close()
3446 except (IOError, OSError), err:
3447 sys.exit('ERROR: unable to overwrite current version')
3448
3449 downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3450
3451 def parseOpts():
3452 # Deferred imports
3453 import getpass
3454 import optparse
3455
3456 def _format_option_string(option):
3457 ''' ('-o', '--option') -> -o, --format METAVAR'''
3458
3459 opts = []
3460
3461 if option._short_opts: opts.append(option._short_opts[0])
3462 if option._long_opts: opts.append(option._long_opts[0])
3463 if len(opts) > 1: opts.insert(1, ', ')
3464
3465 if option.takes_value(): opts.append(' %s' % option.metavar)
3466
3467 return "".join(opts)
3468
3469 def _find_term_columns():
3470 columns = os.environ.get('COLUMNS', None)
3471 if columns:
3472 return int(columns)
3473
3474 try:
3475 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3476 out,err = sp.communicate()
3477 return int(out.split()[1])
3478 except:
3479 pass
3480 return None
3481
3482 max_width = 80
3483 max_help_position = 80
3484
3485 # No need to wrap help messages if we're on a wide console
3486 columns = _find_term_columns()
3487 if columns: max_width = columns
3488
3489 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3490 fmt.format_option_strings = _format_option_string
3491
3492 kw = {
3493 'version' : __version__,
3494 'formatter' : fmt,
3495 'usage' : '%prog [options] url [url...]',
3496 'conflict_handler' : 'resolve',
3497 }
3498
3499 parser = optparse.OptionParser(**kw)
3500
3501 # option groups
3502 general = optparse.OptionGroup(parser, 'General Options')
3503 selection = optparse.OptionGroup(parser, 'Video Selection')
3504 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3505 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3506 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3507 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3508 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3509
3510 general.add_option('-h', '--help',
3511 action='help', help='print this help text and exit')
3512 general.add_option('-v', '--version',
3513 action='version', help='print program version and exit')
3514 general.add_option('-U', '--update',
3515 action='store_true', dest='update_self', help='update this program to latest version')
3516 general.add_option('-i', '--ignore-errors',
3517 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3518 general.add_option('-r', '--rate-limit',
3519 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3520 general.add_option('-R', '--retries',
3521 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3522 general.add_option('--dump-user-agent',
3523 action='store_true', dest='dump_user_agent',
3524 help='display the current browser identification', default=False)
3525
3526 selection.add_option('--playlist-start',
3527 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3528 selection.add_option('--playlist-end',
3529 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3530 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3531 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
3532
3533 authentication.add_option('-u', '--username',
3534 dest='username', metavar='USERNAME', help='account username')
3535 authentication.add_option('-p', '--password',
3536 dest='password', metavar='PASSWORD', help='account password')
3537 authentication.add_option('-n', '--netrc',
3538 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3539
3540
3541 video_format.add_option('-f', '--format',
3542 action='store', dest='format', metavar='FORMAT', help='video format code')
3543 video_format.add_option('--all-formats',
3544 action='store_const', dest='format', help='download all available video formats', const='-1')
3545 video_format.add_option('--max-quality',
3546 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3547
3548
3549 verbosity.add_option('-q', '--quiet',
3550 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3551 verbosity.add_option('-s', '--simulate',
3552 action='store_true', dest='simulate', help='do not download video', default=False)
3553 verbosity.add_option('-g', '--get-url',
3554 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3555 verbosity.add_option('-e', '--get-title',
3556 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3557 verbosity.add_option('--get-thumbnail',
3558 action='store_true', dest='getthumbnail',
3559 help='simulate, quiet but print thumbnail URL', default=False)
3560 verbosity.add_option('--get-description',
3561 action='store_true', dest='getdescription',
3562 help='simulate, quiet but print video description', default=False)
3563 verbosity.add_option('--get-filename',
3564 action='store_true', dest='getfilename',
3565 help='simulate, quiet but print output filename', default=False)
3566 verbosity.add_option('--no-progress',
3567 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3568 verbosity.add_option('--console-title',
3569 action='store_true', dest='consoletitle',
3570 help='display progress in console titlebar', default=False)
3571
3572
3573 filesystem.add_option('-t', '--title',
3574 action='store_true', dest='usetitle', help='use title in file name', default=False)
3575 filesystem.add_option('-l', '--literal',
3576 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3577 filesystem.add_option('-A', '--auto-number',
3578 action='store_true', dest='autonumber',
3579 help='number downloaded files starting from 00000', default=False)
3580 filesystem.add_option('-o', '--output',
3581 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3582 filesystem.add_option('-a', '--batch-file',
3583 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3584 filesystem.add_option('-w', '--no-overwrites',
3585 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3586 filesystem.add_option('-c', '--continue',
3587 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3588 filesystem.add_option('--cookies',
3589 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3590 filesystem.add_option('--no-part',
3591 action='store_true', dest='nopart', help='do not use .part files', default=False)
3592 filesystem.add_option('--no-mtime',
3593 action='store_false', dest='updatetime',
3594 help='do not use the Last-modified header to set the file modification time', default=True)
3595 filesystem.add_option('--write-description',
3596 action='store_true', dest='writedescription',
3597 help='write video description to a .description file', default=False)
3598 filesystem.add_option('--write-info-json',
3599 action='store_true', dest='writeinfojson',
3600 help='write video metadata to a .info.json file', default=False)
3601
3602
3603 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3604 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3605 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3606 help='"best", "aac" or "mp3"; best by default')
3607
3608
3609 parser.add_option_group(general)
3610 parser.add_option_group(selection)
3611 parser.add_option_group(filesystem)
3612 parser.add_option_group(verbosity)
3613 parser.add_option_group(video_format)
3614 parser.add_option_group(authentication)
3615 parser.add_option_group(postproc)
3616
3617 opts, args = parser.parse_args()
3618
3619 return parser, opts, args
3620
def main():
	"""Command-line entry point.

	Parses the command line, validates option combinations, configures the
	global urllib2 opener (proxy + cookies + gzip handler), instantiates every
	information extractor and the FileDownloader, optionally self-updates, then
	downloads all requested URLs and exits with the downloader's return code.
	Exits via sys.exit() on any fatal configuration error.
	"""
	parser, opts, args = parseOpts()

	# Open appropriate CookieJar: an in-memory jar by default, or a
	# Mozilla-format file-backed jar when --cookies was given.
	if opts.cookiefile is None:
		jar = cookielib.CookieJar()
	else:
		try:
			jar = cookielib.MozillaCookieJar(opts.cookiefile)
			# Only load if the file already exists and is readable, so that a
			# brand-new cookie file can still be created on save at exit.
			if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
				jar.load()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to open cookie file')

	# Dump user agent and exit immediately if requested (--dump-user-agent).
	if opts.dump_user_agent:
		print std_headers['User-Agent']
		sys.exit(0)

	# General configuration: install the opener globally so every urllib2
	# request in the program goes through the proxy/cookie/gzip handlers.
	cookie_processor = urllib2.HTTPCookieProcessor(jar)
	opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
	urllib2.install_opener(opener)
	socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

	# Batch file verification: read URLs from --batch-file ('-' means stdin),
	# skipping blank lines and comment lines starting with '#', '/' or ';'.
	batchurls = []
	if opts.batchfile is not None:
		try:
			if opts.batchfile == '-':
				batchfd = sys.stdin
			else:
				batchfd = open(opts.batchfile, 'r')
			batchurls = batchfd.readlines()
			batchurls = [x.strip() for x in batchurls]
			batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		except IOError:
			sys.exit(u'ERROR: batch file could not be read')
	all_urls = batchurls + args

	# Conflicting, missing and erroneous options
	if opts.usenetrc and (opts.username is not None or opts.password is not None):
		parser.error(u'using .netrc conflicts with giving username/password')
	if opts.password is not None and opts.username is None:
		parser.error(u'account username missing')
	if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
		parser.error(u'using output template conflicts with using title, literal title or auto number')
	if opts.usetitle and opts.useliteral:
		parser.error(u'using title conflicts with using literal title')
	if opts.username is not None and opts.password is None:
		# NOTE(review): getpass is not in the visible top-of-file import block —
		# confirm it is imported, otherwise this line raises NameError.
		opts.password = getpass.getpass(u'Type account password and press return:')
	if opts.ratelimit is not None:
		# Convert human-readable limits like '50k' or '44.6m' to bytes/second.
		numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
		if numeric_limit is None:
			parser.error(u'invalid rate limit specified')
		opts.ratelimit = numeric_limit
	if opts.retries is not None:
		try:
			opts.retries = long(opts.retries)
		except (TypeError, ValueError), err:
			parser.error(u'invalid retry count specified')
	try:
		# Playlist indices are 1-based; start must be strictly positive.
		opts.playliststart = int(opts.playliststart)
		if opts.playliststart <= 0:
			raise ValueError(u'Playlist start must be positive')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist start number specified')
	try:
		# -1 is the sentinel for "until the end of the playlist".
		opts.playlistend = int(opts.playlistend)
		if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
			raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')
	if opts.extractaudio:
		if opts.audioformat not in ['best', 'aac', 'mp3']:
			parser.error(u'invalid audio format specified')

	# Information extractors. A few extractors are shared because the
	# search/playlist/user extractors delegate to the plain video extractor.
	youtube_ie = YoutubeIE()
	google_ie = GoogleIE()
	yahoo_ie = YahooIE()
	extractors = [ # Order does matter
		youtube_ie,
		MetacafeIE(youtube_ie),
		DailymotionIE(),
		YoutubePlaylistIE(youtube_ie),
		YoutubeUserIE(youtube_ie),
		YoutubeSearchIE(youtube_ie),
		google_ie,
		GoogleSearchIE(google_ie),
		PhotobucketIE(),
		yahoo_ie,
		YahooSearchIE(yahoo_ie),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),

		# GenericIE is the catch-all fallback and must stay last.
		GenericIE()
	]

	# File downloader. The --get-* options imply both quiet and simulate mode.
	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'forcethumbnail': opts.getthumbnail,
		'forcedescription': opts.getdescription,
		'forcefilename': opts.getfilename,
		'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
		'format': opts.format,
		'format_limit': opts.format_limit,
		# Output template: an explicit -o template wins; otherwise the first
		# matching combination of --all-formats/--title/--literal/--auto-number
		# in this 'or' chain selects the template, falling back to id.ext.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'retries': opts.retries,
		'continuedl': opts.continue_dl,
		'noprogress': opts.noprogress,
		'playliststart': opts.playliststart,
		'playlistend': opts.playlistend,
		# '-o -' streams the video to stdout, so logging must go to stderr.
		'logtostderr': opts.outtmpl == '-',
		'consoletitle': opts.consoletitle,
		'nopart': opts.nopart,
		'updatetime': opts.updatetime,
		'writedescription': opts.writedescription,
		'writeinfojson': opts.writeinfojson,
		'matchtitle': opts.matchtitle,
		'rejecttitle': opts.rejecttitle,
		})
	for extractor in extractors:
		fd.add_info_extractor(extractor)

	# PostProcessors
	if opts.extractaudio:
		fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))

	# Update version: rewrite the running script (sys.argv[0]) in place.
	if opts.update_self:
		updateSelf(fd, sys.argv[0])

	# Maybe do nothing: -U alone is a valid invocation without URLs.
	if len(all_urls) < 1:
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')
		else:
			sys.exit()
	retcode = fd.download(all_urls)

	# Dump cookie jar if requested
	if opts.cookiefile is not None:
		try:
			jar.save()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to save cookie jar')

	sys.exit(retcode)
3793
3794
if __name__ == '__main__':
	try:
		main()
	except DownloadError:
		# The downloader already reported the error; just signal failure.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		# Ctrl-C: exit with a message instead of a traceback.
		sys.exit(u'\nERROR: Interrupted by user')
3804
3805 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: