]> jfr.im git - yt-dlp.git/blame - youtube_dl/__init__.py
added --srt-lang; updated README; extended the -g FAQ
[yt-dlp.git] / youtube_dl / __init__.py
CommitLineData
235b3ba4
PH
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
# Contributors, in roughly chronological order of first contribution.
__authors__ = (
    'Ricardo Garcia Gonzalez',
    'Danny Colligan',
    'Benjamin Johnson',
    'Vasyl\' Vavrychuk',
    'Witold Baryluk',
    'Paweł Paprota',
    'Gergely Imreh',
    'Rogério Brito',
    'Philipp Hagemeister',
    'Sören Schulze',
    'Kevin Ngo',
    'Ori Avtalion',
    'shizeeg',
    )

__license__ = 'Public Domain'
# Release version, date-based (YYYY.MM.DD).
__version__ = '2012.02.27'

# Location of the latest released script; used by the self-update mechanism.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
c9ed14e6 25
235b3ba4
PH
26import cookielib
27import datetime
c9ed14e6 28import getpass
235b3ba4
PH
29import gzip
30import htmlentitydefs
31import HTMLParser
32import httplib
33import locale
34import math
35import netrc
c9ed14e6 36import optparse
235b3ba4
PH
37import os
38import os.path
39import re
c9ed14e6 40import shlex
235b3ba4
PH
41import socket
42import string
43import subprocess
44import sys
45import time
46import urllib
47import urllib2
48import warnings
49import zlib
50
51if os.name == 'nt':
52 import ctypes
53
54try:
55 import email.utils
56except ImportError: # Python 2.4
57 import email.Utils
58try:
59 import cStringIO as StringIO
60except ImportError:
61 import StringIO
62
63# parse_qs was moved from the cgi module to the urlparse module recently.
64try:
65 from urlparse import parse_qs
66except ImportError:
67 from cgi import parse_qs
68
69try:
70 import lxml.etree
71except ImportError:
72 pass # Handled below
73
74try:
75 import xml.etree.ElementTree
76except ImportError: # Python<2.5: Not officially supported, but let it slip
77 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
78
# Default HTTP headers attached to every outgoing request (see
# YoutubeDLHandler.http_request); mimics a desktop Firefox browser.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
86
235b3ba4
PH
# Use the stdlib json module when available; otherwise fall back to a
# minimal pure-Python recursive-descent parser (trivialjson) so that
# --write-info-json keeps working on old interpreters.
try:
    import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    import re
    class json(object):
        @staticmethod
        def loads(s):
            # The parser below operates on unicode text.
            s = s.decode('UTF-8')
            def raiseError(msg, i):
                # All parse errors report the offset and the remaining input.
                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
            def skipSpace(i, expectMore=True):
                # Advance past whitespace; optionally fail at end of input.
                while i < len(s) and s[i] in ' \t\r\n':
                    i += 1
                if expectMore:
                    if i >= len(s):
                        raiseError('Premature end', i)
                return i
            def decodeEscape(match):
                # Translate a single backslash escape, including \uXXXX and
                # UTF-16 surrogate pairs.
                esc = match.group(1)
                _STATIC = {
                    '"': '"',
                    '\\': '\\',
                    '/': '/',
                    'b': unichr(0x8),
                    'f': unichr(0xc),
                    'n': '\n',
                    'r': '\r',
                    't': '\t',
                }
                if esc in _STATIC:
                    return _STATIC[esc]
                if esc[0] == 'u':
                    if len(esc) == 1+4:
                        return unichr(int(esc[1:5], 16))
                    if len(esc) == 5+6 and esc[5:7] == '\\u':
                        # Surrogate pair: combine high and low halves.
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                raise ValueError('Unknown escape ' + str(esc))
            def parseString(i):
                i += 1
                e = i
                # Find the closing quote, skipping over escaped quotes
                # (odd number of preceding backslashes).
                while True:
                    e = s.index('"', e)
                    bslashes = 0
                    while s[e-bslashes-1] == '\\':
                        bslashes += 1
                    if bslashes % 2 == 1:
                        e += 1
                        continue
                    break
                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                stri = rexp.sub(decodeEscape, s[i:e])
                return (e+1,stri)
            def parseObj(i):
                i += 1
                res = {}
                i = skipSpace(i)
                if s[i] == '}': # Empty dictionary
                    return (i+1,res)
                while True:
                    if s[i] != '"':
                        raiseError('Expected a string object key', i)
                    i,key = parseString(i)
                    i = skipSpace(i)
                    if i >= len(s) or s[i] != ':':
                        raiseError('Expected a colon', i)
                    i,val = parse(i+1)
                    res[key] = val
                    i = skipSpace(i)
                    if s[i] == '}':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected comma or closing curly brace', i)
                    i = skipSpace(i+1)
            def parseArray(i):
                res = []
                i = skipSpace(i+1)
                if s[i] == ']': # Empty array
                    return (i+1,res)
                while True:
                    i,val = parse(i)
                    res.append(val)
                    i = skipSpace(i) # Raise exception if premature end
                    if s[i] == ']':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected a comma or closing bracket', i)
                    i = skipSpace(i+1)
            def parseDiscrete(i):
                # The three literal values: true / false / null.
                for k,v in {'true': True, 'false': False, 'null': None}.items():
                    if s.startswith(k, i):
                        return (i+len(k), v)
                raiseError('Not a boolean (or null)', i)
            def parseNumber(i):
                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                if mobj is None:
                    raiseError('Not a number', i)
                nums = mobj.group(1)
                # Fractional or exponential forms become floats.
                if '.' in nums or 'e' in nums or 'E' in nums:
                    return (i+len(nums), float(nums))
                return (i+len(nums), int(nums))
            # Dispatch table keyed on the first character of a value.
            CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
            def parse(i):
                i = skipSpace(i)
                i,res = CHARMAP.get(s[i], parseNumber)(i)
                i = skipSpace(i, False)
                return (i,res)
            i,res = parse(0)
            if i < len(s):
                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
            return res
199
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # The original implementation wrapped this logic in an infinite
    # generator and returned its first element, which added nothing but
    # indirection (and relied on the Python-2-only .next() method).
    try:
        pref = locale.getpreferredencoding()
        # Verify the advertised codec actually exists and can encode text;
        # some platforms report bogus values.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'
    return pref
215
216
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Unicode character reference: decimal (&#160;) or hex (&#xA0;)
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            # Prepend '0' so the string reads '0x...' for base-16 parsing.
            numstr = u'0%s' % numstr
        else:
            base = 10
        return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
242
243
def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename."""
    # Expand HTML entities first, then neutralize the path separator so the
    # title cannot escape into other directories.
    utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
    return utitle.replace(unicode(os.sep), u'%')
248
249
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                # Put stdout into binary mode so video data is not mangled
                # by newline translation on Windows.
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
275
276
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Unparseable date string: mirror the 'no result' convention.
        return None
    return email.utils.mktime_tz(parsed)
284
def _simplify_title(title):
    # Collapse every run of characters that is unsafe for a file name into a
    # single underscore, then strip leading/trailing underscores.
    expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
    return expr.sub(u'_', title).strip(u'_')
235b3ba4 288
0b14e0b3
PH
289def _orderedSet(iterable):
290 """ Remove all duplicates from the input iterable """
291 res = []
292 for el in iterable:
293 if el not in res:
294 res.append(el)
295 return res
296
def _unescapeHTML(s):
    """
    Convert HTML entities in s back into plain characters.

    @param s a string (of type unicode)
    """
    assert type(s) == type(u'')

    htmlParser = HTMLParser.HTMLParser()
    return htmlParser.unescape(s)
1413cd87 305
fefb166c
PH
def _encodeFilename(s):
    """
    Encode a unicode file name for use with filesystem APIs.

    @param s The name of the file (of type unicode)
    """

    assert type(s) == type(u'')

    if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s
    else:
        # Other platforms expect byte strings; unencodable characters
        # are dropped rather than raising.
        return s.encode(sys.getfilesystemencoding(), 'ignore')
1413cd87 320
235b3ba4
PH
class DownloadError(Exception):
    """Raised by FileDownloader objects when a download fails and the
    downloader is not configured to continue on errors; carries the
    appropriate error message."""
    pass
329
330
class SameFileError(Exception):
    """Raised by FileDownloader objects when multiple downloads would
    have to be written to the same file on disk."""
    pass
338
339
class PostProcessingError(Exception):
    """Raised by a PostProcessor's .run() method to signal that the
    postprocessing task failed."""
    pass
347
94fd3201
PH
class MaxDownloadsReached(Exception):
    """Signals that the --max-downloads limit has been hit."""
    pass
351
235b3ba4
PH
352
class UnavailableVideoError(Exception):
    """Raised when a video is requested in a format that is not
    available for that video."""
    pass
360
361
class ContentTooShortError(Exception):
    """Raised when a downloaded file is smaller than the size the server
    announced, which usually means the connection was interrupted."""

    # Byte counts describing the mismatch.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
376
377
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Some servers send raw deflate data without the zlib header;
        # try the raw variant first, then the standard one.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older urllib2 versions do not accept a code argument in the
        # addinfourl constructor; set the attribute manually there.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force the standard headers, replacing any caller-set duplicates.
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        # Internal marker header: strip it and disable compression for
        # this request.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip: wrap the body in a decompressing file object.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate: decompress eagerly and re-wrap.
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
435
436
437class FileDownloader(object):
438 """File Downloader class.
439
440 File downloader objects are the ones responsible of downloading the
441 actual video file and writing it to disk if the user has requested
442 it, among some other tasks. In most cases there should be one per
443 program. As, given a video URL, the downloader doesn't know how to
444 extract all the needed information, task that InfoExtractors do, it
445 has to pass the URL to one of them.
446
447 For this, file downloader objects have a method that allows
448 InfoExtractors to be registered in a given order. When it is passed
449 a URL, the file downloader handles it to the first InfoExtractor it
450 finds that reports being able to handle it. The InfoExtractor extracts
451 all the information about the video or videos the URL refers to, and
452 asks the FileDownloader to process the video information, possibly
453 downloading the video.
454
455 File downloaders accept a lot of parameters. In order not to saturate
456 the object constructor with arguments, it receives a dictionary of
457 options instead. These options are available through the params
458 attribute for the InfoExtractors to use. The FileDownloader also
459 registers itself as the downloader in charge for the InfoExtractors
460 that are added to it, so this is a "mutual registration".
461
462 Available options:
463
464 username: Username for authentication purposes.
465 password: Password for authentication purposes.
466 usenetrc: Use netrc for authentication instead.
467 quiet: Do not print messages to stdout.
468 forceurl: Force printing final URL.
469 forcetitle: Force printing title.
470 forcethumbnail: Force printing thumbnail URL.
471 forcedescription: Force printing description.
472 forcefilename: Force printing final filename.
473 simulate: Do not download the video files.
474 format: Video format code.
475 format_limit: Highest quality format to try.
476 outtmpl: Template for output names.
477 ignoreerrors: Do not stop on download errors.
478 ratelimit: Download speed limit, in bytes/sec.
479 nooverwrites: Prevent overwriting files.
480 retries: Number of times to retry for HTTP error 5xx
481 continuedl: Try to continue downloads if possible.
482 noprogress: Do not print the progress bar.
483 playliststart: Playlist item to start at.
484 playlistend: Playlist item to end at.
485 matchtitle: Download only matching titles.
486 rejecttitle: Reject downloads for matching titles.
487 logtostderr: Log messages to stderr instead of stdout.
488 consoletitle: Display progress in console window's titlebar.
489 nopart: Do not use temporary .part files.
490 updatetime: Use the Last-modified header to set output file timestamps.
491 writedescription: Write the video description to a .description file
492 writeinfojson: Write the video description to a .info.json file
a0432a1e
FV
493 writesubtitles: Write the video subtitles to a .srt file
494 subtitleslang: Language of the subtitles to download
235b3ba4
PH
495 """
496
    # Class-level placeholders; the real values are set in __init__().
    params = None              # dictionary of downloader options (see class docstring)
    _ies = []                  # registered InfoExtractors, tried in order
    _pps = []                  # registered PostProcessors, run as a chain
    _download_retcode = None   # process exit code returned by download()
    _num_downloads = None      # ordinal used by the %(autonumber)s template
    _screen_file = None        # stream used by to_screen() (stdout or stderr)
503
504 def __init__(self, params):
505 """Create a FileDownloader object with the given options."""
506 self._ies = []
507 self._pps = []
508 self._download_retcode = 0
509 self._num_downloads = 0
510 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
511 self.params = params
512
    @staticmethod
    def format_bytes(bytes):
        # Render a byte count as a short human-readable string, e.g. '1.00k'.
        if bytes is None:
            return 'N/A'
        if type(bytes) is str:
            bytes = float(bytes)
        if bytes == 0.0:
            exponent = 0
        else:
            # Index into the suffix table: floor(log base-1024 of bytes).
            exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffix)
526
527 @staticmethod
528 def calc_percent(byte_counter, data_len):
529 if data_len is None:
530 return '---.-%'
531 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
532
    @staticmethod
    def calc_eta(start, now, total, current):
        # Estimate the remaining download time as an MM:SS string, based on
        # the average rate since `start`.
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001: # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            # Does not fit in the MM:SS field.
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)
546
547 @staticmethod
548 def calc_speed(start, now, bytes):
549 dif = now - start
550 if bytes == 0 or dif < 0.001: # One millisecond
551 return '%10s' % '---b/s'
552 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
553
    @staticmethod
    def best_block_size(elapsed_time, bytes):
        # Adapt the next read chunk size to the observed rate, clamped
        # between half and double the previous block size.
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            # Too fast to measure: just grow.
            return long(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return long(new_max)
        if rate < new_min:
            return long(new_min)
        return long(rate)
566
    @staticmethod
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        if matchobj is None:
            return None
        number = float(matchobj.group(1))
        # The suffix's position in the table gives the power of 1024.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))
576
577 def add_info_extractor(self, ie):
578 """Add an InfoExtractor object to the end of the list."""
579 self._ies.append(ie)
580 ie.set_downloader(self)
581
582 def add_post_processor(self, pp):
583 """Add a PostProcessor object to the end of the chain."""
584 self._pps.append(pp)
585 pp.set_downloader(self)
586
    def to_screen(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        assert type(message) == type(u'')
        if not self.params.get('quiet', False):
            terminator = [u'\n', u''][skip_eol]
            output = message + terminator

            # Encode to bytes unless the stream is a Python 3 binary stream.
            if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
                output = output.encode(preferredencoding(), 'ignore')
            self._screen_file.write(output)
            # Flush so progress updates appear immediately.
            self._screen_file.flush()
235b3ba4
PH
598
    def to_stderr(self, message):
        """Print message to stderr."""
        # Encode explicitly; stderr may not accept unicode directly.
        print >>sys.stderr, message.encode(preferredencoding())
602
    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            return
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm-compatible escape sequence for setting the window title.
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
613
    def fixed_template(self):
        """Checks if the output template is fixed."""
        # A template with no %(field)s placeholders yields the same file
        # name for every download, which only works for a single URL.
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
617
618 def trouble(self, message=None):
619 """Determine action to take when a download problem appears.
620
621 Depending on if the downloader has been configured to ignore
622 download errors or not, this method may throw an exception or
623 not when errors are found, after printing the message.
624 """
625 if message is not None:
626 self.to_stderr(message)
627 if not self.params.get('ignoreerrors', False):
628 raise DownloadError(message)
629 self._download_retcode = 1
630
631 def slow_down(self, start_time, byte_counter):
632 """Sleep if the download speed is over the rate limit."""
633 rate_limit = self.params.get('ratelimit', None)
634 if rate_limit is None or byte_counter == 0:
635 return
636 now = time.time()
637 elapsed = now - start_time
638 if elapsed <= 0.0:
639 return
640 speed = float(byte_counter) / elapsed
641 if speed > rate_limit:
642 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
643
644 def temp_name(self, filename):
645 """Returns a temporary filename for the given filename."""
646 if self.params.get('nopart', False) or filename == u'-' or \
fefb166c 647 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
235b3ba4
PH
648 return filename
649 return filename + u'.part'
650
651 def undo_temp_name(self, filename):
652 if filename.endswith(u'.part'):
653 return filename[:-len(u'.part')]
654 return filename
655
    def try_rename(self, old_filename, new_filename):
        # Rename the temporary .part file to its final name; OS-level
        # failures are reported through self.trouble() rather than raised.
        try:
            if old_filename == new_filename:
                return
            os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')
663
664 def try_utime(self, filename, last_modified_hdr):
665 """Try to set the last-modified time of the given file."""
666 if last_modified_hdr is None:
667 return
fefb166c 668 if not os.path.isfile(_encodeFilename(filename)):
235b3ba4
PH
669 return
670 timestr = last_modified_hdr
671 if timestr is None:
672 return
673 filetime = timeconvert(timestr)
674 if filetime is None:
675 return filetime
676 try:
677 os.utime(filename, (time.time(), filetime))
678 except:
679 pass
680 return filetime
681
682 def report_writedescription(self, descfn):
683 """ Report that the description file is being written """
fefb166c 684 self.to_screen(u'[info] Writing video description to: ' + descfn)
235b3ba4 685
a0432a1e
FV
686 def report_writesubtitles(self, srtfn):
687 """ Report that the subtitles file is being written """
688 self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
689
235b3ba4
PH
690 def report_writeinfojson(self, infofn):
691 """ Report that the metadata file has been written """
fefb166c 692 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
235b3ba4
PH
693
694 def report_destination(self, filename):
695 """Report destination filename."""
fefb166c 696 self.to_screen(u'[download] Destination: ' + filename)
235b3ba4
PH
697
698 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
699 """Report download progress."""
700 if self.params.get('noprogress', False):
701 return
702 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
703 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
704 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
705 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
706
707 def report_resuming_byte(self, resume_len):
708 """Report attempt to resume at given byte."""
709 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
710
711 def report_retry(self, count, retries):
712 """Report retry in case of HTTP error 5xx"""
713 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
714
    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # The file name may not be representable in the console
            # encoding; fall back to a generic message.
            self.to_screen(u'[download] The file has already been downloaded')
721
722 def report_unable_to_resume(self):
723 """Report it was impossible to resume download."""
724 self.to_screen(u'[download] Unable to resume')
725
726 def report_finish(self):
727 """Report download finished."""
728 if self.params.get('noprogress', False):
729 self.to_screen(u'[download] Download completed')
730 else:
731 self.to_screen(u'')
732
733 def increment_downloads(self):
734 """Increment the ordinal that assigns a number to each file."""
735 self._num_downloads += 1
736
    def prepare_filename(self, info_dict):
        """Generate the output filename."""
        try:
            template_dict = dict(info_dict)
            # Extra fields available in the --output template:
            template_dict['epoch'] = unicode(long(time.time()))
            template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
            filename = self.params['outtmpl'] % template_dict
            return filename
        except (ValueError, KeyError), err:
            # Bad template or a field missing from info_dict.
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            return None
748
77315556
PH
749 def _match_entry(self, info_dict):
750 """ Returns None iff the file should be downloaded """
751
752 title = info_dict['title']
753 matchtitle = self.params.get('matchtitle', False)
754 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
755 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
756 rejecttitle = self.params.get('rejecttitle', False)
757 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
758 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
759 return None
760
235b3ba4
PH
761 def process_info(self, info_dict):
762 """Process a single dictionary returned by an InfoExtractor."""
b88a5250 763
77315556
PH
764 reason = self._match_entry(info_dict)
765 if reason is not None:
766 self.to_screen(u'[download] ' + reason)
767 return
768
c379c181 769 max_downloads = self.params.get('max_downloads')
b88a5250 770 if max_downloads is not None:
c379c181 771 if self._num_downloads > int(max_downloads):
94fd3201 772 raise MaxDownloadsReached()
77315556 773
235b3ba4
PH
774 filename = self.prepare_filename(info_dict)
775
776 # Forced printings
777 if self.params.get('forcetitle', False):
778 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
779 if self.params.get('forceurl', False):
780 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
781 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
782 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
783 if self.params.get('forcedescription', False) and 'description' in info_dict:
784 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
785 if self.params.get('forcefilename', False) and filename is not None:
786 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
787 if self.params.get('forceformat', False):
788 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
789
790 # Do nothing else if in simulate mode
791 if self.params.get('simulate', False):
792 return
793
794 if filename is None:
795 return
796
235b3ba4 797 try:
fefb166c
PH
798 dn = os.path.dirname(_encodeFilename(filename))
799 if dn != '' and not os.path.exists(dn): # dn is already encoded
235b3ba4
PH
800 os.makedirs(dn)
801 except (OSError, IOError), err:
802 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
803 return
804
805 if self.params.get('writedescription', False):
806 try:
fefb166c 807 descfn = filename + u'.description'
235b3ba4 808 self.report_writedescription(descfn)
fefb166c 809 descfile = open(_encodeFilename(descfn), 'wb')
235b3ba4
PH
810 try:
811 descfile.write(info_dict['description'].encode('utf-8'))
812 finally:
813 descfile.close()
814 except (OSError, IOError):
815 self.trouble(u'ERROR: Cannot write description file ' + descfn)
816 return
a0432a1e
FV
817
818 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
819 # subtitles download errors are already managed as troubles in relevant IE
820 # that way it will silently go on when used with unsupporting IE
821 try:
822 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
823 self.report_writesubtitles(srtfn)
824 srtfile = open(_encodeFilename(srtfn), 'wb')
825 try:
826 srtfile.write(info_dict['subtitles'].encode('utf-8'))
827 finally:
828 srtfile.close()
829 except (OSError, IOError):
830 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
831 return
235b3ba4
PH
832
833 if self.params.get('writeinfojson', False):
fefb166c 834 infofn = filename + u'.info.json'
235b3ba4
PH
835 self.report_writeinfojson(infofn)
836 try:
837 json.dump
838 except (NameError,AttributeError):
839 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
840 return
841 try:
fefb166c 842 infof = open(_encodeFilename(infofn), 'wb')
235b3ba4
PH
843 try:
844 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
845 json.dump(json_info_dict, infof)
846 finally:
847 infof.close()
848 except (OSError, IOError):
849 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
850 return
851
852 if not self.params.get('skip_download', False):
fefb166c 853 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
a5647b79
PH
854 success = True
855 else:
856 try:
857 success = self._do_download(filename, info_dict)
858 except (OSError, IOError), err:
859 raise UnavailableVideoError
860 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
861 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
862 return
863 except (ContentTooShortError, ), err:
864 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
865 return
235b3ba4
PH
866
867 if success:
868 try:
869 self.post_process(filename, info_dict)
870 except (PostProcessingError), err:
871 self.trouble(u'ERROR: postprocessing: %s' % str(err))
872 return
873
874 def download(self, url_list):
875 """Download a given list of URLs."""
876 if len(url_list) > 1 and self.fixed_template():
877 raise SameFileError(self.params['outtmpl'])
878
879 for url in url_list:
880 suitable_found = False
881 for ie in self._ies:
882 # Go to next InfoExtractor if not suitable
883 if not ie.suitable(url):
884 continue
885
886 # Suitable InfoExtractor found
887 suitable_found = True
888
889 # Extract information from URL and process it
890 ie.extract(url)
891
892 # Suitable InfoExtractor had been found; go to next URL
893 break
894
895 if not suitable_found:
896 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
897
898 return self._download_retcode
899
900 def post_process(self, filename, ie_info):
901 """Run the postprocessing chain on the given file."""
902 info = dict(ie_info)
903 info['filepath'] = filename
904 for pp in self._pps:
905 info = pp.run(info)
906 if info is None:
907 break
908
    def _download_with_rtmpdump(self, filename, url, player_url):
        # Delegate rtmp:// downloads to the external rtmpdump binary,
        # retrying/resuming until the file stops growing.
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
        if self.params.get('verbose', False):
            try:
                import pipes
                shell_quote = lambda args: ' '.join(map(pipes.quote, args))
            except ImportError:
                # pipes unavailable: fall back to repr() for display only.
                shell_quote = repr
            self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
        retval = subprocess.call(args)
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(_encodeFilename(tmpfilename))
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(_encodeFilename(tmpfilename))
            if prevsize == cursize and retval == 1:
                # No progress and rtmpdump gave up: stop retrying.
                break
            # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
            if prevsize == cursize and retval == 2 and cursize > 1024:
                self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
                retval = 0
                break
        if retval == 0:
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
            self.try_rename(tmpfilename, filename)
            return True
        else:
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            return False
953
954 def _do_download(self, filename, info_dict):
955 url = info_dict['url']
956 player_url = info_dict.get('player_url', None)
957
958 # Check file already present
fefb166c 959 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
235b3ba4
PH
960 self.report_file_already_downloaded(filename)
961 return True
962
963 # Attempt to download using rtmpdump
964 if url.startswith('rtmp'):
965 return self._download_with_rtmpdump(filename, url, player_url)
966
967 tmpfilename = self.temp_name(filename)
968 stream = None
969
970 # Do not include the Accept-Encoding header
971 headers = {'Youtubedl-no-compression': 'True'}
972 basic_request = urllib2.Request(url, None, headers)
973 request = urllib2.Request(url, None, headers)
974
975 # Establish possible resume length
fefb166c
PH
976 if os.path.isfile(_encodeFilename(tmpfilename)):
977 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
235b3ba4
PH
978 else:
979 resume_len = 0
980
981 open_mode = 'wb'
982 if resume_len != 0:
983 if self.params.get('continuedl', False):
984 self.report_resuming_byte(resume_len)
985 request.add_header('Range','bytes=%d-' % resume_len)
986 open_mode = 'ab'
987 else:
988 resume_len = 0
989
990 count = 0
991 retries = self.params.get('retries', 0)
992 while count <= retries:
993 # Establish connection
994 try:
995 if count == 0 and 'urlhandle' in info_dict:
996 data = info_dict['urlhandle']
997 data = urllib2.urlopen(request)
998 break
999 except (urllib2.HTTPError, ), err:
1000 if (err.code < 500 or err.code >= 600) and err.code != 416:
1001 # Unexpected HTTP error
1002 raise
1003 elif err.code == 416:
1004 # Unable to resume (requested range not satisfiable)
1005 try:
1006 # Open the connection again without the range header
1007 data = urllib2.urlopen(basic_request)
1008 content_length = data.info()['Content-Length']
1009 except (urllib2.HTTPError, ), err:
1010 if err.code < 500 or err.code >= 600:
1011 raise
1012 else:
1013 # Examine the reported length
1014 if (content_length is not None and
1015 (resume_len - 100 < long(content_length) < resume_len + 100)):
1016 # The file had already been fully downloaded.
1017 # Explanation to the above condition: in issue #175 it was revealed that
1018 # YouTube sometimes adds or removes a few bytes from the end of the file,
1019 # changing the file size slightly and causing problems for some users. So
1020 # I decided to implement a suggested change and consider the file
1021 # completely downloaded if the file size differs less than 100 bytes from
1022 # the one in the hard drive.
1023 self.report_file_already_downloaded(filename)
1024 self.try_rename(tmpfilename, filename)
1025 return True
1026 else:
1027 # The length does not match, we start the download over
1028 self.report_unable_to_resume()
1029 open_mode = 'wb'
1030 break
1031 # Retry
1032 count += 1
1033 if count <= retries:
1034 self.report_retry(count, retries)
1035
1036 if count > retries:
1037 self.trouble(u'ERROR: giving up after %s retries' % retries)
1038 return False
1039
1040 data_len = data.info().get('Content-length', None)
1041 if data_len is not None:
1042 data_len = long(data_len) + resume_len
1043 data_len_str = self.format_bytes(data_len)
1044 byte_counter = 0 + resume_len
1045 block_size = 1024
1046 start = time.time()
1047 while True:
1048 # Download and write
1049 before = time.time()
1050 data_block = data.read(block_size)
1051 after = time.time()
1052 if len(data_block) == 0:
1053 break
1054 byte_counter += len(data_block)
1055
1056 # Open file just in time
1057 if stream is None:
1058 try:
1059 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1060 assert stream is not None
1061 filename = self.undo_temp_name(tmpfilename)
1062 self.report_destination(filename)
1063 except (OSError, IOError), err:
1064 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1065 return False
1066 try:
1067 stream.write(data_block)
1068 except (IOError, OSError), err:
1069 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1070 return False
1071 block_size = self.best_block_size(after - before, len(data_block))
1072
1073 # Progress message
1074 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1075 if data_len is None:
1076 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1077 else:
1078 percent_str = self.calc_percent(byte_counter, data_len)
1079 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1080 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1081
1082 # Apply rate limit
1083 self.slow_down(start, byte_counter - resume_len)
1084
1085 if stream is None:
1086 self.trouble(u'\nERROR: Did not get any data blocks')
1087 return False
1088 stream.close()
1089 self.report_finish()
1090 if data_len is not None and byte_counter != data_len:
1091 raise ContentTooShortError(byte_counter, long(data_len))
1092 self.try_rename(tmpfilename, filename)
1093
1094 # Update file modification time
1095 if self.params.get('updatetime', True):
1096 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1097
1098 return True
1099
1100
class InfoExtractor(object):
    """Information Extractor class.

    An information extractor takes a URL and pulls out of it everything
    the FileDownloader needs.  The extracted data is stored in a
    dictionary which is then handed to the FileDownloader, which in turn
    may download the video to the file system, among other outcomes.
    The dictionaries must include the following fields:

    id:		Video identifier.
    url:		Final video URL.
    uploader:	Nickname of the video uploader.
    title:		Literal title.
    stitle:		Simplified title.
    ext:		Video filename extension.
    format:		Video format.
    player_url:	SWF Player URL (may be None).

    The following fields are optional.  Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3.  They are only used when their respective
    forced printing functions are called:

    thumbnail:	Full URL to a video thumbnail image.
    description:	One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    # One-time initialization flag and the FileDownloader we report to.
    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return bool(re.match(self._VALID_URL, url))

    def initialize(self):
        """Initializes an instance (authentication, etc); runs only once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
1169
1170
1171class YoutubeIE(InfoExtractor):
1172 """Information extractor for youtube.com."""
1173
1174 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1175 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1176 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1177 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1178 _NETRC_MACHINE = 'youtube'
1179 # Listed in order of quality
1180 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
23e6b8ad 1181 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
235b3ba4
PH
1182 _video_extensions = {
1183 '13': '3gp',
1184 '17': 'mp4',
1185 '18': 'mp4',
1186 '22': 'mp4',
1187 '37': 'mp4',
1188 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1189 '43': 'webm',
1190 '44': 'webm',
1191 '45': 'webm',
1192 }
1193 _video_dimensions = {
1194 '5': '240x400',
1195 '6': '???',
1196 '13': '???',
1197 '17': '144x176',
1198 '18': '360x640',
1199 '22': '720x1280',
1200 '34': '360x640',
1201 '35': '480x854',
1202 '37': '1080x1920',
1203 '38': '3072x4096',
1204 '43': '360x640',
1205 '44': '480x854',
1206 '45': '720x1280',
1207 }
1208 IE_NAME = u'youtube'
1209
1210 def report_lang(self):
1211 """Report attempt to set language."""
1212 self._downloader.to_screen(u'[youtube] Setting language')
1213
1214 def report_login(self):
1215 """Report attempt to log in."""
1216 self._downloader.to_screen(u'[youtube] Logging in')
1217
1218 def report_age_confirmation(self):
1219 """Report attempt to confirm age."""
1220 self._downloader.to_screen(u'[youtube] Confirming age')
1221
1222 def report_video_webpage_download(self, video_id):
1223 """Report attempt to download video webpage."""
1224 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1225
1226 def report_video_info_webpage_download(self, video_id):
1227 """Report attempt to download video info webpage."""
1228 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1229
a0432a1e
FV
1230 def report_video_subtitles_download(self, video_id):
1231 """Report attempt to download video info webpage."""
1232 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1233
235b3ba4
PH
1234 def report_information_extraction(self, video_id):
1235 """Report attempt to extract video information."""
1236 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1237
1238 def report_unavailable_format(self, video_id, format):
1239 """Report extracted video URL."""
1240 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1241
1242 def report_rtmp_download(self):
1243 """Indicate the download will use the RTMP protocol."""
1244 self._downloader.to_screen(u'[youtube] RTMP download detected')
1245
a0432a1e
FV
1246 def _closed_captions_xml_to_srt(self, xml_string):
1247 srt = ''
1248 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1249 # TODO parse xml instead of regex
1250 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1251 if not dur: dur = '4'
1252 start = float(start)
1253 end = start + float(dur)
1254 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1255 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1256 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1257 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
1258 srt += str(n) + '\n'
1259 srt += start + ' --> ' + end + '\n'
1260 srt += caption + '\n\n'
1261 return srt
1262
235b3ba4
PH
1263 def _print_formats(self, formats):
1264 print 'Available formats:'
1265 for x in formats:
1266 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1267
1268 def _real_initialize(self):
1269 if self._downloader is None:
1270 return
1271
1272 username = None
1273 password = None
1274 downloader_params = self._downloader.params
1275
1276 # Attempt to use provided username and password or .netrc data
1277 if downloader_params.get('username', None) is not None:
1278 username = downloader_params['username']
1279 password = downloader_params['password']
1280 elif downloader_params.get('usenetrc', False):
1281 try:
1282 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1283 if info is not None:
1284 username = info[0]
1285 password = info[2]
1286 else:
1287 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1288 except (IOError, netrc.NetrcParseError), err:
1289 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1290 return
1291
1292 # Set language
1293 request = urllib2.Request(self._LANG_URL)
1294 try:
1295 self.report_lang()
1296 urllib2.urlopen(request).read()
1297 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1298 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1299 return
1300
1301 # No authentication to be performed
1302 if username is None:
1303 return
1304
1305 # Log in
1306 login_form = {
1307 'current_form': 'loginForm',
1308 'next': '/',
1309 'action_login': 'Log In',
1310 'username': username,
1311 'password': password,
1312 }
1313 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1314 try:
1315 self.report_login()
1316 login_results = urllib2.urlopen(request).read()
1317 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1318 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1319 return
1320 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1321 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1322 return
1323
1324 # Confirm age
1325 age_form = {
1326 'next_url': '/',
1327 'action_confirm': 'Confirm',
1328 }
1329 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1330 try:
1331 self.report_age_confirmation()
1332 age_results = urllib2.urlopen(request).read()
1333 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1334 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1335 return
1336
1337 def _real_extract(self, url):
1338 # Extract video id from URL
1339 mobj = re.match(self._VALID_URL, url)
1340 if mobj is None:
1341 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1342 return
1343 video_id = mobj.group(2)
1344
1345 # Get video webpage
1346 self.report_video_webpage_download(video_id)
1347 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1348 try:
1349 video_webpage = urllib2.urlopen(request).read()
1350 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1351 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1352 return
1353
1354 # Attempt to extract SWF player URL
1355 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1356 if mobj is not None:
1357 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1358 else:
1359 player_url = None
1360
1361 # Get video info
1362 self.report_video_info_webpage_download(video_id)
1363 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1364 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1365 % (video_id, el_type))
1366 request = urllib2.Request(video_info_url)
1367 try:
1368 video_info_webpage = urllib2.urlopen(request).read()
1369 video_info = parse_qs(video_info_webpage)
1370 if 'token' in video_info:
1371 break
1372 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1373 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1374 return
1375 if 'token' not in video_info:
1376 if 'reason' in video_info:
1377 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1378 else:
1379 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1380 return
1381
1382 # Start extracting information
1383 self.report_information_extraction(video_id)
1384
1385 # uploader
1386 if 'author' not in video_info:
1387 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1388 return
1389 video_uploader = urllib.unquote_plus(video_info['author'][0])
1390
1391 # title
1392 if 'title' not in video_info:
1393 self._downloader.trouble(u'ERROR: unable to extract video title')
1394 return
1395 video_title = urllib.unquote_plus(video_info['title'][0])
1396 video_title = video_title.decode('utf-8')
1397 video_title = sanitize_title(video_title)
1398
1399 # simplified title
e092418d 1400 simple_title = _simplify_title(video_title)
235b3ba4
PH
1401
1402 # thumbnail image
1403 if 'thumbnail_url' not in video_info:
1404 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1405 video_thumbnail = ''
1406 else: # don't panic if we can't find it
1407 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1408
1409 # upload date
1410 upload_date = u'NA'
1411 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1412 if mobj is not None:
1413 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1414 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1415 for expression in format_expressions:
1416 try:
1417 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1418 except:
1419 pass
1420
1421 # description
1422 try:
1423 lxml.etree
1424 except NameError:
1425 video_description = u'No description available.'
ff3a2b8e
PH
1426 mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1427 if mobj is not None:
1428 video_description = mobj.group(1).decode('utf-8')
235b3ba4
PH
1429 else:
1430 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1431 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1432 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1433 # TODO use another parser
a0432a1e
FV
1434
1435 # closed captions
1436 video_subtitles = None
1437 if self._downloader.params.get('writesubtitles', False):
1438 self.report_video_subtitles_download(video_id)
1439 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1440 try:
1441 srt_list = urllib2.urlopen(request).read()
1442 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1443 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1444 else:
1445 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
1446 if srt_lang_list:
1447 if self._downloader.params.get('subtitleslang', False):
1448 srt_lang = self._downloader.params.get('subtitleslang')
1449 elif 'en' in srt_lang_list:
1450 srt_lang = 'en'
1451 else:
1452 srt_lang = srt_lang_list[0]
1453 if not srt_lang in srt_lang_list:
1454 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
1455 else:
1456 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1457 try:
1458 srt_xml = urllib2.urlopen(request).read()
1459 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1460 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1461 else:
1462 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1463 else:
1464 self._downloader.trouble(u'WARNING: video has no closed captions')
235b3ba4
PH
1465
1466 # token
1467 video_token = urllib.unquote_plus(video_info['token'][0])
1468
1469 # Decide which formats to download
1470 req_format = self._downloader.params.get('format', None)
1471
1472 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1473 self.report_rtmp_download()
1474 video_url_list = [(None, video_info['conn'][0])]
1475 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1476 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1477 url_data = [parse_qs(uds) for uds in url_data_strs]
1478 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1479 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1480
1481 format_limit = self._downloader.params.get('format_limit', None)
23e6b8ad
PH
1482 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1483 if format_limit is not None and format_limit in available_formats:
1484 format_list = available_formats[available_formats.index(format_limit):]
235b3ba4 1485 else:
23e6b8ad 1486 format_list = available_formats
235b3ba4
PH
1487 existing_formats = [x for x in format_list if x in url_map]
1488 if len(existing_formats) == 0:
1489 self._downloader.trouble(u'ERROR: no known formats available for video')
1490 return
1491 if self._downloader.params.get('listformats', None):
1492 self._print_formats(existing_formats)
1493 return
1494 if req_format is None or req_format == 'best':
1495 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1496 elif req_format == 'worst':
1497 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1498 elif req_format in ('-1', 'all'):
1499 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1500 else:
1501 # Specific formats. We pick the first in a slash-delimeted sequence.
1502 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1503 req_formats = req_format.split('/')
1504 video_url_list = None
1505 for rf in req_formats:
1506 if rf in url_map:
1507 video_url_list = [(rf, url_map[rf])]
1508 break
1509 if video_url_list is None:
1510 self._downloader.trouble(u'ERROR: requested format not available')
1511 return
1512 else:
1513 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1514 return
1515
1516 for format_param, video_real_url in video_url_list:
1517 # At this point we have a new video
1518 self._downloader.increment_downloads()
1519
1520 # Extension
1521 video_extension = self._video_extensions.get(format_param, 'flv')
1522
1523 try:
1524 # Process video information
1525 self._downloader.process_info({
1526 'id': video_id.decode('utf-8'),
1527 'url': video_real_url.decode('utf-8'),
1528 'uploader': video_uploader.decode('utf-8'),
1529 'upload_date': upload_date,
1530 'title': video_title,
1531 'stitle': simple_title,
1532 'ext': video_extension.decode('utf-8'),
1533 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1534 'thumbnail': video_thumbnail.decode('utf-8'),
1535 'description': video_description,
1536 'player_url': player_url,
a0432a1e 1537 'subtitles': video_subtitles
235b3ba4
PH
1538 })
1539 except UnavailableVideoError, err:
1540 self._downloader.trouble(u'\nERROR: unable to download video')
1541
1542
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    # YoutubeIE instance used for videos that Metacafe merely embeds from
    # YouTube (ids of the form yt-XXXX).
    _youtube_ie = None
    IE_NAME = u'metacafe'

    def __init__(self, youtube_ie, downloader=None):
        """Constructor. `youtube_ie` handles embedded YouTube videos."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and disable the family filter so that
        age-restricted videos become accessible."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader and hand the result
        to the downloader.  YouTube-hosted videos are delegated."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate to the YouTube extractor with the real video id.
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
            return

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        # Old-style pages expose the media URL directly as a query parameter.
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available (access token appended to the URL)
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Newer pages embed the media data as JSON inside the player's
            # flashvars parameter.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id':		video_id.decode('utf-8'),
                'url':		video_url.decode('utf-8'),
                'uploader':	video_uploader.decode('utf-8'),
                'upload_date':	u'NA',
                'title':	video_title,
                'stitle':	simple_title,
                'ext':		video_extension.decode('utf-8'),
                'format':	u'NA',
                'player_url':	None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1683
1684
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a Dailymotion
        video page and hand the result to the downloader."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information;
        # the cookie disables the family filter for restricted videos.
        request = urllib2.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        sequence = urllib.unquote(mobj.group(1))
        # The "sequence" player variable contains the SD media URL.
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
        video_title = sanitize_title(video_title)
        simple_title = _simplify_title(video_title)

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id':		video_id.decode('utf-8'),
                'url':		video_url.decode('utf-8'),
                'uploader':	video_uploader.decode('utf-8'),
                'upload_date':	u'NA',
                'title':	video_title,
                'stitle':	simple_title,
                'ext':		video_extension.decode('utf-8'),
                'format':	u'NA',
                'player_url':	None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1771
1772
1773class GoogleIE(InfoExtractor):
1774 """Information extractor for video.google.com."""
1775
1776 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1777 IE_NAME = u'video.google'
1778
1779 def __init__(self, downloader=None):
1780 InfoExtractor.__init__(self, downloader)
1781
1782 def report_download_webpage(self, video_id):
1783 """Report webpage download."""
1784 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1785
1786 def report_extraction(self, video_id):
1787 """Report information extraction."""
1788 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1789
1790 def _real_extract(self, url):
1791 # Extract id from URL
1792 mobj = re.match(self._VALID_URL, url)
1793 if mobj is None:
1794 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1795 return
1796
1797 # At this point we have a new video
1798 self._downloader.increment_downloads()
1799 video_id = mobj.group(1)
1800
1801 video_extension = 'mp4'
1802
1803 # Retrieve video webpage to extract further information
1804 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1805 try:
1806 self.report_download_webpage(video_id)
1807 webpage = urllib2.urlopen(request).read()
1808 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1809 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1810 return
1811
1812 # Extract URL, uploader, and title from webpage
1813 self.report_extraction(video_id)
1814 mobj = re.search(r"download_url:'([^']+)'", webpage)
1815 if mobj is None:
1816 video_extension = 'flv'
1817 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1818 if mobj is None:
1819 self._downloader.trouble(u'ERROR: unable to extract media URL')
1820 return
1821 mediaURL = urllib.unquote(mobj.group(1))
1822 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1823 mediaURL = mediaURL.replace('\\x26', '\x26')
1824
1825 video_url = mediaURL
1826
1827 mobj = re.search(r'<title>(.*)</title>', webpage)
1828 if mobj is None:
1829 self._downloader.trouble(u'ERROR: unable to extract title')
1830 return
1831 video_title = mobj.group(1).decode('utf-8')
1832 video_title = sanitize_title(video_title)
e092418d 1833 simple_title = _simplify_title(video_title)
235b3ba4
PH
1834
1835 # Extract video description
1836 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1837 if mobj is None:
1838 self._downloader.trouble(u'ERROR: unable to extract video description')
1839 return
1840 video_description = mobj.group(1).decode('utf-8')
1841 if not video_description:
1842 video_description = 'No description available.'
1843
1844 # Extract video thumbnail
1845 if self._downloader.params.get('forcethumbnail', False):
1846 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1847 try:
1848 webpage = urllib2.urlopen(request).read()
1849 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1850 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1851 return
1852 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1853 if mobj is None:
1854 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1855 return
1856 video_thumbnail = mobj.group(1)
1857 else: # we need something to pass to process_info
1858 video_thumbnail = ''
1859
1860 try:
1861 # Process video information
1862 self._downloader.process_info({
1863 'id': video_id.decode('utf-8'),
1864 'url': video_url.decode('utf-8'),
1865 'uploader': u'NA',
1866 'upload_date': u'NA',
1867 'title': video_title,
1868 'stitle': simple_title,
1869 'ext': video_extension.decode('utf-8'),
1870 'format': u'NA',
1871 'player_url': None,
1872 })
1873 except UnavailableVideoError:
1874 self._downloader.trouble(u'\nERROR: unable to download video')
1875
1876
1877class PhotobucketIE(InfoExtractor):
1878 """Information extractor for photobucket.com."""
1879
1880 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1881 IE_NAME = u'photobucket'
1882
1883 def __init__(self, downloader=None):
1884 InfoExtractor.__init__(self, downloader)
1885
1886 def report_download_webpage(self, video_id):
1887 """Report webpage download."""
1888 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1889
1890 def report_extraction(self, video_id):
1891 """Report information extraction."""
1892 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1893
1894 def _real_extract(self, url):
1895 # Extract id from URL
1896 mobj = re.match(self._VALID_URL, url)
1897 if mobj is None:
1898 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1899 return
1900
1901 # At this point we have a new video
1902 self._downloader.increment_downloads()
1903 video_id = mobj.group(1)
1904
1905 video_extension = 'flv'
1906
1907 # Retrieve video webpage to extract further information
1908 request = urllib2.Request(url)
1909 try:
1910 self.report_download_webpage(video_id)
1911 webpage = urllib2.urlopen(request).read()
1912 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1913 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1914 return
1915
1916 # Extract URL, uploader, and title from webpage
1917 self.report_extraction(video_id)
1918 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1919 if mobj is None:
1920 self._downloader.trouble(u'ERROR: unable to extract media URL')
1921 return
1922 mediaURL = urllib.unquote(mobj.group(1))
1923
1924 video_url = mediaURL
1925
1926 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1927 if mobj is None:
1928 self._downloader.trouble(u'ERROR: unable to extract title')
1929 return
1930 video_title = mobj.group(1).decode('utf-8')
1931 video_title = sanitize_title(video_title)
e092418d 1932 simple_title = _simplify_title(vide_title)
235b3ba4
PH
1933
1934 video_uploader = mobj.group(2).decode('utf-8')
1935
1936 try:
1937 # Process video information
1938 self._downloader.process_info({
1939 'id': video_id.decode('utf-8'),
1940 'url': video_url.decode('utf-8'),
1941 'uploader': video_uploader,
1942 'upload_date': u'NA',
1943 'title': video_title,
1944 'stitle': simple_title,
1945 'ext': video_extension.decode('utf-8'),
1946 'format': u'NA',
1947 'player_url': None,
1948 })
1949 except UnavailableVideoError:
1950 self._downloader.trouble(u'\nERROR: unable to download video')
1951
1952
1953class YahooIE(InfoExtractor):
1954 """Information extractor for video.yahoo.com."""
1955
1956 # _VALID_URL matches all Yahoo! Video URLs
1957 # _VPAGE_URL matches only the extractable '/watch/' URLs
1958 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1959 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1960 IE_NAME = u'video.yahoo'
1961
1962 def __init__(self, downloader=None):
1963 InfoExtractor.__init__(self, downloader)
1964
1965 def report_download_webpage(self, video_id):
1966 """Report webpage download."""
1967 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1968
1969 def report_extraction(self, video_id):
1970 """Report information extraction."""
1971 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1972
1973 def _real_extract(self, url, new_video=True):
1974 # Extract ID from URL
1975 mobj = re.match(self._VALID_URL, url)
1976 if mobj is None:
1977 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1978 return
1979
1980 # At this point we have a new video
1981 self._downloader.increment_downloads()
1982 video_id = mobj.group(2)
1983 video_extension = 'flv'
1984
1985 # Rewrite valid but non-extractable URLs as
1986 # extractable English language /watch/ URLs
1987 if re.match(self._VPAGE_URL, url) is None:
1988 request = urllib2.Request(url)
1989 try:
1990 webpage = urllib2.urlopen(request).read()
1991 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1992 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1993 return
1994
1995 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1996 if mobj is None:
1997 self._downloader.trouble(u'ERROR: Unable to extract id field')
1998 return
1999 yahoo_id = mobj.group(1)
2000
2001 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2002 if mobj is None:
2003 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2004 return
2005 yahoo_vid = mobj.group(1)
2006
2007 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2008 return self._real_extract(url, new_video=False)
2009
2010 # Retrieve video webpage to extract further information
2011 request = urllib2.Request(url)
2012 try:
2013 self.report_download_webpage(video_id)
2014 webpage = urllib2.urlopen(request).read()
2015 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2016 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2017 return
2018
2019 # Extract uploader and title from webpage
2020 self.report_extraction(video_id)
2021 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2022 if mobj is None:
2023 self._downloader.trouble(u'ERROR: unable to extract video title')
2024 return
2025 video_title = mobj.group(1).decode('utf-8')
e092418d 2026 simple_title = _simplify_title(video_title)
235b3ba4
PH
2027
2028 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2029 if mobj is None:
2030 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2031 return
2032 video_uploader = mobj.group(1).decode('utf-8')
2033
2034 # Extract video thumbnail
2035 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2036 if mobj is None:
2037 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2038 return
2039 video_thumbnail = mobj.group(1).decode('utf-8')
2040
2041 # Extract video description
2042 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2043 if mobj is None:
2044 self._downloader.trouble(u'ERROR: unable to extract video description')
2045 return
2046 video_description = mobj.group(1).decode('utf-8')
2047 if not video_description:
2048 video_description = 'No description available.'
2049
2050 # Extract video height and width
2051 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2052 if mobj is None:
2053 self._downloader.trouble(u'ERROR: unable to extract video height')
2054 return
2055 yv_video_height = mobj.group(1)
2056
2057 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2058 if mobj is None:
2059 self._downloader.trouble(u'ERROR: unable to extract video width')
2060 return
2061 yv_video_width = mobj.group(1)
2062
2063 # Retrieve video playlist to extract media URL
2064 # I'm not completely sure what all these options are, but we
2065 # seem to need most of them, otherwise the server sends a 401.
2066 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
2067 yv_bitrate = '700' # according to Wikipedia this is hard-coded
2068 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2069 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2070 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2071 try:
2072 self.report_download_webpage(video_id)
2073 webpage = urllib2.urlopen(request).read()
2074 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2075 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2076 return
2077
2078 # Extract media URL from playlist XML
2079 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2080 if mobj is None:
2081 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2082 return
2083 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2084 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2085
2086 try:
2087 # Process video information
2088 self._downloader.process_info({
2089 'id': video_id.decode('utf-8'),
2090 'url': video_url,
2091 'uploader': video_uploader,
2092 'upload_date': u'NA',
2093 'title': video_title,
2094 'stitle': simple_title,
2095 'ext': video_extension.decode('utf-8'),
2096 'thumbnail': video_thumbnail.decode('utf-8'),
2097 'description': video_description,
2098 'thumbnail': video_thumbnail,
2099 'player_url': None,
2100 })
2101 except UnavailableVideoError:
2102 self._downloader.trouble(u'\nERROR: unable to download video')
2103
2104
2105class VimeoIE(InfoExtractor):
2106 """Information extractor for vimeo.com."""
2107
2108 # _VALID_URL matches Vimeo URLs
2109 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2110 IE_NAME = u'vimeo'
2111
2112 def __init__(self, downloader=None):
2113 InfoExtractor.__init__(self, downloader)
2114
2115 def report_download_webpage(self, video_id):
2116 """Report webpage download."""
2117 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2118
2119 def report_extraction(self, video_id):
2120 """Report information extraction."""
2121 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2122
2123 def _real_extract(self, url, new_video=True):
2124 # Extract ID from URL
2125 mobj = re.match(self._VALID_URL, url)
2126 if mobj is None:
2127 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2128 return
2129
2130 # At this point we have a new video
2131 self._downloader.increment_downloads()
2132 video_id = mobj.group(1)
2133
2134 # Retrieve video webpage to extract further information
4a34b725 2135 request = urllib2.Request(url, None, std_headers)
235b3ba4
PH
2136 try:
2137 self.report_download_webpage(video_id)
2138 webpage = urllib2.urlopen(request).read()
2139 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2140 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2141 return
2142
2143 # Now we begin extracting as much information as we can from what we
2144 # retrieved. First we extract the information common to all extractors,
2145 # and latter we extract those that are Vimeo specific.
2146 self.report_extraction(video_id)
2147
4a34b725
PH
2148 # Extract the config JSON
2149 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2150 try:
2151 config = json.loads(config)
2152 except:
2153 self._downloader.trouble(u'ERROR: unable to extract info section')
235b3ba4 2154 return
4a34b725
PH
2155
2156 # Extract title
2157 video_title = config["video"]["title"]
fa2672f9 2158 simple_title = _simplify_title(video_title)
235b3ba4
PH
2159
2160 # Extract uploader
4a34b725 2161 video_uploader = config["video"]["owner"]["name"]
235b3ba4
PH
2162
2163 # Extract video thumbnail
4a34b725 2164 video_thumbnail = config["video"]["thumbnail"]
235b3ba4 2165
4a34b725
PH
2166 # Extract video description
2167 try:
2168 lxml.etree
2169 except NameError:
2170 video_description = u'No description available.'
2171 mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2172 if mobj is not None:
2173 video_description = mobj.group(1)
235b3ba4 2174 else:
4a34b725
PH
2175 html_parser = lxml.etree.HTMLParser()
2176 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2177 video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2178 # TODO use another parser
235b3ba4 2179
4a34b725
PH
2180 # Extract upload date
2181 video_upload_date = u'NA'
2182 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2183 if mobj is not None:
2184 video_upload_date = mobj.group(1)
2185
2186 # Vimeo specific: extract request signature and timestamp
2187 sig = config['request']['signature']
2188 timestamp = config['request']['timestamp']
2189
2190 # Vimeo specific: extract video codec and quality information
2191 # TODO bind to format param
2192 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2193 for codec in codecs:
2194 if codec[0] in config["video"]["files"]:
2195 video_codec = codec[0]
2196 video_extension = codec[1]
2197 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2198 else: quality = 'sd'
2199 break
2200 else:
2201 self._downloader.trouble(u'ERROR: no known codec found')
235b3ba4 2202 return
235b3ba4 2203
4a34b725
PH
2204 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2205 %(video_id, sig, timestamp, quality, video_codec.upper())
235b3ba4
PH
2206
2207 try:
2208 # Process video information
2209 self._downloader.process_info({
4a34b725 2210 'id': video_id,
235b3ba4
PH
2211 'url': video_url,
2212 'uploader': video_uploader,
4a34b725 2213 'upload_date': video_upload_date,
235b3ba4
PH
2214 'title': video_title,
2215 'stitle': simple_title,
4a34b725 2216 'ext': video_extension,
235b3ba4
PH
2217 'thumbnail': video_thumbnail,
2218 'description': video_description,
2219 'player_url': None,
2220 })
2221 except UnavailableVideoError:
2222 self._downloader.trouble(u'ERROR: unable to download video')
2223
2224
2225class GenericIE(InfoExtractor):
2226 """Generic last-resort information extractor."""
2227
2228 _VALID_URL = r'.*'
2229 IE_NAME = u'generic'
2230
2231 def __init__(self, downloader=None):
2232 InfoExtractor.__init__(self, downloader)
2233
2234 def report_download_webpage(self, video_id):
2235 """Report webpage download."""
2236 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2237 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2238
2239 def report_extraction(self, video_id):
2240 """Report information extraction."""
2241 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2242
2243 def _real_extract(self, url):
2244 # At this point we have a new video
2245 self._downloader.increment_downloads()
2246
2247 video_id = url.split('/')[-1]
2248 request = urllib2.Request(url)
2249 try:
2250 self.report_download_webpage(video_id)
2251 webpage = urllib2.urlopen(request).read()
2252 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2253 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2254 return
2255 except ValueError, err:
2256 # since this is the last-resort InfoExtractor, if
2257 # this error is thrown, it'll be thrown here
2258 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2259 return
2260
2261 self.report_extraction(video_id)
2262 # Start with something easy: JW Player in SWFObject
2263 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2264 if mobj is None:
2265 # Broaden the search a little bit
2266 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2267 if mobj is None:
2268 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2269 return
2270
2271 # It's possible that one of the regexes
2272 # matched, but returned an empty group:
2273 if mobj.group(1) is None:
2274 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2275 return
2276
2277 video_url = urllib.unquote(mobj.group(1))
2278 video_id = os.path.basename(video_url)
2279
2280 # here's a fun little line of code for you:
2281 video_extension = os.path.splitext(video_id)[1][1:]
2282 video_id = os.path.splitext(video_id)[0]
2283
2284 # it's tempting to parse this further, but you would
2285 # have to take into account all the variations like
2286 # Video Title - Site Name
2287 # Site Name | Video Title
2288 # Video Title - Tagline | Site Name
2289 # and so on and so forth; it's just not practical
2290 mobj = re.search(r'<title>(.*)</title>', webpage)
2291 if mobj is None:
2292 self._downloader.trouble(u'ERROR: unable to extract title')
2293 return
2294 video_title = mobj.group(1).decode('utf-8')
2295 video_title = sanitize_title(video_title)
e092418d 2296 simple_title = _simplify_title(video_title)
235b3ba4
PH
2297
2298 # video uploader is domain name
2299 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2300 if mobj is None:
2301 self._downloader.trouble(u'ERROR: unable to extract title')
2302 return
2303 video_uploader = mobj.group(1).decode('utf-8')
2304
2305 try:
2306 # Process video information
2307 self._downloader.process_info({
2308 'id': video_id.decode('utf-8'),
2309 'url': video_url.decode('utf-8'),
2310 'uploader': video_uploader,
2311 'upload_date': u'NA',
2312 'title': video_title,
2313 'stitle': simple_title,
2314 'ext': video_extension.decode('utf-8'),
2315 'format': u'NA',
2316 'player_url': None,
2317 })
2318 except UnavailableVideoError, err:
2319 self._downloader.trouble(u'\nERROR: unable to download video')
2320
2321
2322class YoutubeSearchIE(InfoExtractor):
2323 """Information Extractor for YouTube search queries."""
2324 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
4a34b725 2325 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
235b3ba4
PH
2326 _youtube_ie = None
2327 _max_youtube_results = 1000
2328 IE_NAME = u'youtube:search'
2329
2330 def __init__(self, youtube_ie, downloader=None):
2331 InfoExtractor.__init__(self, downloader)
2332 self._youtube_ie = youtube_ie
2333
2334 def report_download_page(self, query, pagenum):
2335 """Report attempt to download playlist page with given number."""
2336 query = query.decode(preferredencoding())
2337 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2338
2339 def _real_initialize(self):
2340 self._youtube_ie.initialize()
2341
2342 def _real_extract(self, query):
2343 mobj = re.match(self._VALID_URL, query)
2344 if mobj is None:
2345 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2346 return
2347
2348 prefix, query = query.split(':')
2349 prefix = prefix[8:]
2350 query = query.encode('utf-8')
2351 if prefix == '':
2352 self._download_n_results(query, 1)
2353 return
2354 elif prefix == 'all':
2355 self._download_n_results(query, self._max_youtube_results)
2356 return
2357 else:
2358 try:
2359 n = long(prefix)
2360 if n <= 0:
2361 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2362 return
2363 elif n > self._max_youtube_results:
2364 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2365 n = self._max_youtube_results
2366 self._download_n_results(query, n)
2367 return
2368 except ValueError: # parsing prefix as integer fails
2369 self._download_n_results(query, 1)
2370 return
2371
2372 def _download_n_results(self, query, n):
2373 """Downloads a specified number of results for a query"""
2374
2375 video_ids = []
4a34b725
PH
2376 pagenum = 0
2377 limit = n
235b3ba4 2378
4a34b725
PH
2379 while (50 * pagenum) < limit:
2380 self.report_download_page(query, pagenum+1)
2381 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
235b3ba4
PH
2382 request = urllib2.Request(result_url)
2383 try:
4a34b725 2384 data = urllib2.urlopen(request).read()
235b3ba4 2385 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4a34b725 2386 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
235b3ba4 2387 return
4a34b725 2388 api_response = json.loads(data)['data']
235b3ba4 2389
4a34b725
PH
2390 new_ids = list(video['id'] for video in api_response['items'])
2391 video_ids += new_ids
2392
2393 limit = min(n, api_response['totalItems'])
2394 pagenum += 1
235b3ba4 2395
4a34b725
PH
2396 if len(video_ids) > n:
2397 video_ids = video_ids[:n]
2398 for id in video_ids:
2399 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2400 return
235b3ba4
PH
2401
2402
2403class GoogleSearchIE(InfoExtractor):
2404 """Information Extractor for Google Video search queries."""
2405 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2406 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
7e45ec57
PH
2407 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2408 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
235b3ba4
PH
2409 _google_ie = None
2410 _max_google_results = 1000
2411 IE_NAME = u'video.google:search'
2412
2413 def __init__(self, google_ie, downloader=None):
2414 InfoExtractor.__init__(self, downloader)
2415 self._google_ie = google_ie
2416
2417 def report_download_page(self, query, pagenum):
2418 """Report attempt to download playlist page with given number."""
2419 query = query.decode(preferredencoding())
2420 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2421
2422 def _real_initialize(self):
2423 self._google_ie.initialize()
2424
2425 def _real_extract(self, query):
2426 mobj = re.match(self._VALID_URL, query)
2427 if mobj is None:
2428 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2429 return
2430
2431 prefix, query = query.split(':')
2432 prefix = prefix[8:]
2433 query = query.encode('utf-8')
2434 if prefix == '':
2435 self._download_n_results(query, 1)
2436 return
2437 elif prefix == 'all':
2438 self._download_n_results(query, self._max_google_results)
2439 return
2440 else:
2441 try:
2442 n = long(prefix)
2443 if n <= 0:
2444 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2445 return
2446 elif n > self._max_google_results:
2447 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2448 n = self._max_google_results
2449 self._download_n_results(query, n)
2450 return
2451 except ValueError: # parsing prefix as integer fails
2452 self._download_n_results(query, 1)
2453 return
2454
2455 def _download_n_results(self, query, n):
2456 """Downloads a specified number of results for a query"""
2457
2458 video_ids = []
7e45ec57 2459 pagenum = 0
235b3ba4
PH
2460
2461 while True:
2462 self.report_download_page(query, pagenum)
7e45ec57 2463 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
235b3ba4
PH
2464 request = urllib2.Request(result_url)
2465 try:
2466 page = urllib2.urlopen(request).read()
2467 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2468 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2469 return
2470
2471 # Extract video identifiers
2472 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2473 video_id = mobj.group(1)
7e45ec57 2474 if video_id not in video_ids:
235b3ba4 2475 video_ids.append(video_id)
235b3ba4
PH
2476 if len(video_ids) == n:
2477 # Specified n videos reached
2478 for id in video_ids:
2479 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2480 return
2481
2482 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2483 for id in video_ids:
2484 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2485 return
2486
2487 pagenum = pagenum + 1
2488
2489
2490class YahooSearchIE(InfoExtractor):
2491 """Information Extractor for Yahoo! Video search queries."""
2492 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2493 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2494 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2495 _MORE_PAGES_INDICATOR = r'\s*Next'
2496 _yahoo_ie = None
2497 _max_yahoo_results = 1000
2498 IE_NAME = u'video.yahoo:search'
2499
2500 def __init__(self, yahoo_ie, downloader=None):
2501 InfoExtractor.__init__(self, downloader)
2502 self._yahoo_ie = yahoo_ie
2503
2504 def report_download_page(self, query, pagenum):
2505 """Report attempt to download playlist page with given number."""
2506 query = query.decode(preferredencoding())
2507 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2508
2509 def _real_initialize(self):
2510 self._yahoo_ie.initialize()
2511
2512 def _real_extract(self, query):
2513 mobj = re.match(self._VALID_URL, query)
2514 if mobj is None:
2515 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2516 return
2517
2518 prefix, query = query.split(':')
2519 prefix = prefix[8:]
2520 query = query.encode('utf-8')
2521 if prefix == '':
2522 self._download_n_results(query, 1)
2523 return
2524 elif prefix == 'all':
2525 self._download_n_results(query, self._max_yahoo_results)
2526 return
2527 else:
2528 try:
2529 n = long(prefix)
2530 if n <= 0:
2531 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2532 return
2533 elif n > self._max_yahoo_results:
2534 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2535 n = self._max_yahoo_results
2536 self._download_n_results(query, n)
2537 return
2538 except ValueError: # parsing prefix as integer fails
2539 self._download_n_results(query, 1)
2540 return
2541
2542 def _download_n_results(self, query, n):
2543 """Downloads a specified number of results for a query"""
2544
2545 video_ids = []
2546 already_seen = set()
2547 pagenum = 1
2548
2549 while True:
2550 self.report_download_page(query, pagenum)
2551 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2552 request = urllib2.Request(result_url)
2553 try:
2554 page = urllib2.urlopen(request).read()
2555 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2556 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2557 return
2558
2559 # Extract video identifiers
2560 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2561 video_id = mobj.group(1)
2562 if video_id not in already_seen:
2563 video_ids.append(video_id)
2564 already_seen.add(video_id)
2565 if len(video_ids) == n:
2566 # Specified n videos reached
2567 for id in video_ids:
2568 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2569 return
2570
2571 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2572 for id in video_ids:
2573 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2574 return
2575
2576 pagenum = pagenum + 1
2577
2578
class YoutubePlaylistIE(InfoExtractor):
	"""Information Extractor for YouTube playlists.

	Resolves playlist/artist/course URLs to individual video IDs and
	delegates each video to the wrapped YoutubeIE instance.
	"""

	# Group 1: playlist type prefix (p|a|list), group 2: playlist id,
	# group 3: optional single-video id embedded in the URL.
	_VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
	# Filled with (access page, prefix, playlist id, page number).
	_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
	# %s is the playlist id; matches video links scoped to that playlist.
	_VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
	# Presence of a "Next" link means more result pages follow.
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
	_youtube_ie = None
	IE_NAME = u'youtube:playlist'

	def __init__(self, youtube_ie, downloader=None):
		# youtube_ie: the YoutubeIE that performs per-video extraction.
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_download_page(self, playlist_id, pagenum):
		"""Report attempt to download playlist page with given number."""
		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

	def _real_initialize(self):
		# Delegate initialization (e.g. login) to the wrapped YouTube IE.
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		"""Collect all video ids of the playlist and extract each video."""
		# Extract playlist id
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		# Single video case: the URL named one concrete video, so skip
		# playlist expansion and extract just that id.
		if mobj.group(3) is not None:
			self._youtube_ie.extract(mobj.group(3))
			return

		# Download playlist pages
		# prefix is 'p' as default for playlists but there are other types that need extra care
		playlist_prefix = mobj.group(1)
		if playlist_prefix == 'a':
			playlist_access = 'artist'
		else:
			# 'list' and other prefixes are normalized to the plain
			# view_play_list?p=... form.
			playlist_prefix = 'p'
			playlist_access = 'view_play_list'
		playlist_id = mobj.group(2)
		video_ids = []
		pagenum = 1

		while True:
			self.report_download_page(playlist_id, pagenum)
			url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
			request = urllib2.Request(url)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers, de-duplicated within the page.
			ids_in_page = []
			for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))
			video_ids.extend(ids_in_page)

			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				break
			pagenum = pagenum + 1

		# Apply the user's --playlist-start/--playlist-end window
		# (1-based on the command line, hence the -1).
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)
		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		for id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
		return
2655
2656
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users.

	Enumerates a user's uploads through the GData API and delegates each
	video to the wrapped YoutubeIE instance.
	"""

	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	# GData caps results per request; pages are fetched in chunks of this size.
	_GDATA_PAGE_SIZE = 50
	# Filled with (username, page size, 1-based start index).
	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
	_youtube_ie = None
	IE_NAME = u'youtube:user'

	def __init__(self, youtube_ie, downloader=None):
		# youtube_ie: the YoutubeIE that performs per-video extraction.
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_download_page(self, username, start_index):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
				(username, start_index, start_index + self._GDATA_PAGE_SIZE))

	def _real_initialize(self):
		# Delegate initialization (e.g. login) to the wrapped YouTube IE.
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		"""Collect all upload ids of the user and extract each video."""
		# Extract username
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		username = mobj.group(1)

		# Download video ids using YouTube Data API. Result size per
		# query is limited (currently to 50 videos) so we need to query
		# page by page until there are no video ids - it means we got
		# all of them.

		video_ids = []
		pagenum = 0

		while True:
			# GData start-index is 1-based.
			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
			self.report_download_page(username, start_index)

			request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers, de-duplicated within the page.
			ids_in_page = []

			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))

			video_ids.extend(ids_in_page)

			# A little optimization - if current page is not
			# "full", ie. does not contain PAGE_SIZE video ids then
			# we can assume that this page is the last one - there
			# are no more ids on further pages - no need to query
			# again.

			if len(ids_in_page) < self._GDATA_PAGE_SIZE:
				break

			pagenum += 1

		all_ids_count = len(video_ids)
		# Apply the user's --playlist-start/--playlist-end window
		# (1-based on the command line, hence the -1).
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)

		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
				(username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2743
2744
2745class DepositFilesIE(InfoExtractor):
2746 """Information extractor for depositfiles.com"""
2747
2748 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2749 IE_NAME = u'DepositFiles'
2750
2751 def __init__(self, downloader=None):
2752 InfoExtractor.__init__(self, downloader)
2753
2754 def report_download_webpage(self, file_id):
2755 """Report webpage download."""
2756 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2757
2758 def report_extraction(self, file_id):
2759 """Report information extraction."""
2760 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2761
2762 def _real_extract(self, url):
2763 # At this point we have a new file
2764 self._downloader.increment_downloads()
2765
2766 file_id = url.split('/')[-1]
2767 # Rebuild url in english locale
2768 url = 'http://depositfiles.com/en/files/' + file_id
2769
2770 # Retrieve file webpage with 'Free download' button pressed
2771 free_download_indication = { 'gateway_result' : '1' }
2772 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2773 try:
2774 self.report_download_webpage(file_id)
2775 webpage = urllib2.urlopen(request).read()
2776 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2777 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2778 return
2779
2780 # Search for the real file URL
2781 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2782 if (mobj is None) or (mobj.group(1) is None):
2783 # Try to figure out reason of the error.
2784 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2785 if (mobj is not None) and (mobj.group(1) is not None):
2786 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2787 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2788 else:
2789 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2790 return
2791
2792 file_url = mobj.group(1)
2793 file_extension = os.path.splitext(file_url)[1][1:]
2794
2795 # Search for file title
2796 mobj = re.search(r'<b title="(.*?)">', webpage)
2797 if mobj is None:
2798 self._downloader.trouble(u'ERROR: unable to extract title')
2799 return
2800 file_title = mobj.group(1).decode('utf-8')
2801
2802 try:
2803 # Process file information
2804 self._downloader.process_info({
2805 'id': file_id.decode('utf-8'),
2806 'url': file_url.decode('utf-8'),
2807 'uploader': u'NA',
2808 'upload_date': u'NA',
2809 'title': file_title,
2810 'stitle': file_title,
2811 'ext': file_extension.decode('utf-8'),
2812 'format': u'NA',
2813 'player_url': None,
2814 })
2815 except UnavailableVideoError, err:
2816 self._downloader.trouble(u'ERROR: unable to download file')
2817
2818
class FacebookIE(InfoExtractor):
	"""Information Extractor for Facebook

	Optionally logs in (credentials from --username/--password or .netrc),
	then scrapes the video page's JavaScript for title, owner, thumbnail
	and per-quality source URLs.
	"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
	_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
	_NETRC_MACHINE = 'facebook'
	# Ordered best-first; used for --format-limit slicing below.
	_available_formats = ['video', 'highqual', 'lowqual']
	_video_extensions = {
		'video': 'mp4',
		'highqual': 'mp4',
		'lowqual': 'mp4',
	}
	IE_NAME = u'facebook'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def _reporter(self, message):
		"""Add header and report message."""
		self._downloader.to_screen(u'[facebook] %s' % message)

	def report_login(self):
		"""Report attempt to log in."""
		self._reporter(u'Logging in')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._reporter(u'%s: Downloading video webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._reporter(u'%s: Extracting video information' % video_id)

	def _parse_page(self, video_webpage):
		"""Extract video information from page

		Returns a dict with any of title/description/owner/thumbnail that
		could be found, plus 'video_urls' mapping format name -> URL.
		"""
		# General data
		data = {'title': r'\("video_title", "(.*?)"\)',
			'description': r'<div class="datawrap">(.*?)</div>',
			'owner': r'\("video_owner_name", "(.*?)"\)',
			'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
			}
		video_info = {}
		for piece in data.keys():
			mobj = re.search(data[piece], video_webpage)
			if mobj is not None:
				# Values are JS-escaped and URL-quoted inside the page.
				video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

		# Video urls
		video_urls = {}
		for fmt in self._available_formats:
			mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
			if mobj is not None:
				# URL is in a Javascript segment inside an escaped Unicode format within
				# the generally utf-8 page
				video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
		video_info['video_urls'] = video_urls

		return video_info

	def _real_initialize(self):
		"""Log in to Facebook if credentials are available; best-effort."""
		if self._downloader is None:
			return

		useremail = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			useremail = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					useremail = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# No credentials: proceed anonymously.
		if useremail is None:
			return

		# Log in
		login_form = {
			'email': useremail,
			'pass': password,
			'login': 'Log+In'
			}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# A login form in the response means the login did not stick.
			if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Download the video page and process every requested format."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('ID')

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
		try:
			page = urllib2.urlopen(request)
			video_webpage = page.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# Extract information
		video_info = self._parse_page(video_webpage)

		# uploader
		if 'owner' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = video_info['owner']

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = video_info['title']
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		simple_title = _simplify_title(video_title)

		# thumbnail image (non-fatal if missing)
		if 'thumbnail' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:
			video_thumbnail = video_info['thumbnail']

		# upload date
		upload_date = u'NA'
		if 'upload_date' in video_info:
			upload_time = video_info['upload_date']
			timetuple = email.utils.parsedate_tz(upload_time)
			if timetuple is not None:
				try:
					upload_date = time.strftime('%Y%m%d', timetuple[0:9])
				except:
					pass

		# description
		video_description = video_info.get('description', 'No description available.')

		url_map = video_info['video_urls']
		if len(url_map.keys()) > 0:
			# Decide which formats to download
			req_format = self._downloader.params.get('format', None)
			format_limit = self._downloader.params.get('format_limit', None)

			# --format-limit restricts the candidate list to qualities at
			# or below the limit (list is ordered best-first).
			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if req_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format == '-1':
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific format
				if req_format not in url_map:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
				video_url_list = [(req_format, url_map[req_format])] # Specific format

		for format_param, video_real_url in video_url_list:

			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'mp4')

			try:
				# Process video information
				self._downloader.process_info({
					'id': video_id.decode('utf-8'),
					'url': video_real_url.decode('utf-8'),
					'uploader': video_uploader.decode('utf-8'),
					'upload_date': upload_date,
					'title': video_title,
					'stitle': simple_title,
					'ext': video_extension.decode('utf-8'),
					'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail': video_thumbnail.decode('utf-8'),
					'description': video_description.decode('utf-8'),
					'player_url': None,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
3034
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv

	Appends skin=json to the page URL; if the server instead answers with
	the media itself (Content-Type video/*), that direct stream is used,
	otherwise the JSON metadata is parsed.
	"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Captures the extension of the media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		"""Extract video info via the blip.tv JSON API (or direct stream)."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON query with the correct separator.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				# 'urlhandle' lets the downloader reuse this open response.
				info = {
					'id': title,
					'url': url,
					'title': title,
					'stitle': _simplify_title(title),
					'ext': ext,
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			try:
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				# NOTE(review): relies on a module-level `json` binding
				# defined elsewhere in this file (not among the visible
				# top-of-file imports).
				json_data = json.loads(json_code)
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# blip.tv datestamps look like '08-15-11 02:30PM'.
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'stitle': _simplify_title(data['title']),
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		self._downloader.increment_downloads()

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
3127
3128
3129class MyVideoIE(InfoExtractor):
3130 """Information Extractor for myvideo.de."""
3131
3132 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3133 IE_NAME = u'myvideo'
3134
3135 def __init__(self, downloader=None):
3136 InfoExtractor.__init__(self, downloader)
3137
3138 def report_download_webpage(self, video_id):
3139 """Report webpage download."""
3140 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3141
3142 def report_extraction(self, video_id):
3143 """Report information extraction."""
3144 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3145
3146 def _real_extract(self,url):
3147 mobj = re.match(self._VALID_URL, url)
3148 if mobj is None:
3149 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3150 return
3151
3152 video_id = mobj.group(1)
235b3ba4
PH
3153
3154 # Get video webpage
3155 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3156 try:
3157 self.report_download_webpage(video_id)
3158 webpage = urllib2.urlopen(request).read()
3159 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3160 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3161 return
3162
3163 self.report_extraction(video_id)
3164 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3165 webpage)
3166 if mobj is None:
3167 self._downloader.trouble(u'ERROR: unable to extract media URL')
3168 return
3169 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3170
3171 mobj = re.search('<title>([^<]+)</title>', webpage)
3172 if mobj is None:
3173 self._downloader.trouble(u'ERROR: unable to extract title')
3174 return
3175
3176 video_title = mobj.group(1)
3177 video_title = sanitize_title(video_title)
3178
e092418d
PH
3179 simple_title = _simplify_title(video_title)
3180
235b3ba4
PH
3181 try:
3182 self._downloader.process_info({
3183 'id': video_id,
3184 'url': video_url,
3185 'uploader': u'NA',
3186 'upload_date': u'NA',
3187 'title': video_title,
3188 'stitle': simple_title,
3189 'ext': u'flv',
3190 'format': u'NA',
3191 'player_url': None,
3192 })
3193 except UnavailableVideoError:
3194 self._downloader.trouble(u'\nERROR: Unable to download video')
3195
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report

	Accepts either shortnames (:tds, :colbert, ...) or full-episodes URLs,
	follows the site redirect to a concrete episode, then walks the MRSS
	index to download every media item of that episode.
	"""

	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Resolve the episode, then extract and process all its media parts."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Shortname form: rewrite to the show's full-episodes URL and
		# re-match so the named groups below are populated.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# Without an episode part, the site redirects to the newest episode.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Re-parse the redirected URL to learn the concrete episode.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# Find the Flash player URL and the mtvnservices media URI.
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			# Follow redirects to obtain the canonical player URL.
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# One <item> per media part of the episode.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
					urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect (bitrate, url) pairs for the available renditions.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': _simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3330
3331
3332class EscapistIE(InfoExtractor):
3333 """Information extractor for The Escapist """
3334
3335 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3336 IE_NAME = u'escapist'
3337
3338 def report_extraction(self, showName):
3339 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3340
3341 def report_config_download(self, showName):
3342 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3343
235b3ba4
PH
3344 def _real_extract(self, url):
3345 htmlParser = HTMLParser.HTMLParser()
3346
3347 mobj = re.match(self._VALID_URL, url)
3348 if mobj is None:
3349 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3350 return
3351 showName = mobj.group('showname')
3352 videoId = mobj.group('episode')
3353
3354 self.report_extraction(showName)
3355 try:
3356 webPage = urllib2.urlopen(url).read()
3357 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3358 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3359 return
3360
3361 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3362 description = htmlParser.unescape(descMatch.group(1))
3363 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3364 imgUrl = htmlParser.unescape(imgMatch.group(1))
3365 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3366 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3367 configUrlMatch = re.search('config=(.*)$', playerUrl)
3368 configUrl = urllib2.unquote(configUrlMatch.group(1))
3369
3370 self.report_config_download(showName)
3371 try:
3372 configJSON = urllib2.urlopen(configUrl).read()
3373 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3374 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3375 return
3376
3377 # Technically, it's JavaScript, not JSON
3378 configJSON = configJSON.replace("'", '"')
3379
3380 try:
3381 config = json.loads(configJSON)
3382 except (ValueError,), err:
3383 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3384 return
3385
3386 playlist = config['playlist']
3387 videoUrl = playlist[1]['url']
3388
3389 self._downloader.increment_downloads()
3390 info = {
3391 'id': videoId,
3392 'url': videoUrl,
3393 'uploader': showName,
3394 'upload_date': None,
3395 'title': showName,
e092418d 3396 'stitle': _simplify_title(showName),
235b3ba4
PH
3397 'ext': 'flv',
3398 'format': 'flv',
3399 'thumbnail': imgUrl,
3400 'description': description,
3401 'player_url': playerUrl,
3402 }
3403
3404 try:
3405 self._downloader.process_info(info)
3406 except UnavailableVideoError, err:
3407 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3408
3409
class CollegeHumorIE(InfoExtractor):
	"""Information extractor for collegehumor.com

	Resolves the page's internal video id, then reads title/description/
	file URL from the moogaloop metadata XML.
	"""

	_VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
	IE_NAME = u'collegehumor'

	def report_webpage(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Extract video info from a collegehumor video page."""
		htmlParser = HTMLParser.HTMLParser()

		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('videoid')

		self.report_webpage(video_id)
		request = urllib2.Request(url)
		try:
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# The metadata service is keyed by an internal id embedded in the page.
		m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
		if m is None:
			self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
			return
		internal_video_id = m.group('internalvideoid')

		info = {
			'id': video_id,
			'internal_id': internal_video_id,
		}

		self.report_extraction(video_id)
		xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
		try:
			metaXml = urllib2.urlopen(xmlUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
			return

		# Any missing element raises IndexError via the [0] lookups below.
		mdoc = xml.etree.ElementTree.fromstring(metaXml)
		try:
			videoNode = mdoc.findall('./video')[0]
			info['description'] = videoNode.findall('./description')[0].text
			info['title'] = videoNode.findall('./caption')[0].text
			info['stitle'] = _simplify_title(info['title'])
			info['url'] = videoNode.findall('./file')[0].text
			info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
		except IndexError:
			self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
			return

		self._downloader.increment_downloads()

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
3480
3481
3482class XVideosIE(InfoExtractor):
3483 """Information extractor for xvideos.com"""
3484
3485 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3486 IE_NAME = u'xvideos'
3487
3488 def report_webpage(self, video_id):
3489 """Report information extraction."""
3490 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3491
3492 def report_extraction(self, video_id):
3493 """Report information extraction."""
3494 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3495
235b3ba4
PH
3496 def _real_extract(self, url):
3497 htmlParser = HTMLParser.HTMLParser()
3498
3499 mobj = re.match(self._VALID_URL, url)
3500 if mobj is None:
3501 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3502 return
3503 video_id = mobj.group(1).decode('utf-8')
3504
3505 self.report_webpage(video_id)
3506
3507 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3508 try:
3509 webpage = urllib2.urlopen(request).read()
3510 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3511 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3512 return
3513
3514 self.report_extraction(video_id)
3515
3516
3517 # Extract video URL
3518 mobj = re.search(r'flv_url=(.+?)&', webpage)
3519 if mobj is None:
3520 self._downloader.trouble(u'ERROR: unable to extract video url')
3521 return
3522 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3523
3524
3525 # Extract title
3526 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3527 if mobj is None:
3528 self._downloader.trouble(u'ERROR: unable to extract video title')
3529 return
3530 video_title = mobj.group(1).decode('utf-8')
3531
3532
3533 # Extract video thumbnail
3534 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3535 if mobj is None:
3536 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3537 return
3538 video_thumbnail = mobj.group(1).decode('utf-8')
3539
3540
3541
3542 self._downloader.increment_downloads()
3543 info = {
3544 'id': video_id,
3545 'url': video_url,
3546 'uploader': None,
3547 'upload_date': None,
3548 'title': video_title,
e092418d 3549 'stitle': _simplify_title(video_title),
235b3ba4
PH
3550 'ext': 'flv',
3551 'format': 'flv',
3552 'thumbnail': video_thumbnail,
3553 'description': None,
3554 'player_url': None,
3555 }
3556
3557 try:
3558 self._downloader.process_info(info)
3559 except UnavailableVideoError, err:
3560 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3561
3562
3563class SoundcloudIE(InfoExtractor):
3564 """Information extractor for soundcloud.com
3565 To access the media, the uid of the song and a stream token
3566 must be extracted from the page source and the script must make
3567 a request to media.soundcloud.com/crossdomain.xml. Then
3568 the media can be grabbed by requesting from an url composed
3569 of the stream token and uid
3570 """
3571
3572 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3573 IE_NAME = u'soundcloud'
3574
3575 def __init__(self, downloader=None):
3576 InfoExtractor.__init__(self, downloader)
3577
3578 def report_webpage(self, video_id):
3579 """Report information extraction."""
3580 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3581
3582 def report_extraction(self, video_id):
3583 """Report information extraction."""
3584 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3585
3586 def _real_extract(self, url):
3587 htmlParser = HTMLParser.HTMLParser()
3588
3589 mobj = re.match(self._VALID_URL, url)
3590 if mobj is None:
3591 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3592 return
3593
3594 # extract uploader (which is in the url)
3595 uploader = mobj.group(1).decode('utf-8')
3596 # extract simple title (uploader + slug of song title)
3597 slug_title = mobj.group(2).decode('utf-8')
3598 simple_title = uploader + '-' + slug_title
3599
3600 self.report_webpage('%s/%s' % (uploader, slug_title))
3601
3602 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3603 try:
3604 webpage = urllib2.urlopen(request).read()
3605 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3606 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3607 return
3608
3609 self.report_extraction('%s/%s' % (uploader, slug_title))
3610
3611 # extract uid and stream token that soundcloud hands out for access
3612 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3613 if mobj:
3614 video_id = mobj.group(1)
3615 stream_token = mobj.group(2)
3616
3617 # extract unsimplified title
3618 mobj = re.search('"title":"(.*?)",', webpage)
3619 if mobj:
3620 title = mobj.group(1)
3621
3622 # construct media url (with uid/token)
3623 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3624 mediaURL = mediaURL % (video_id, stream_token)
3625
3626 # description
3627 description = u'No description available'
3628 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3629 if mobj:
3630 description = mobj.group(1)
3631
3632 # upload date
3633 upload_date = None
3634 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3635 if mobj:
3636 try:
3637 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
dc0a294a 3638 except Exception, e:
235b3ba4
PH
3639 print str(e)
3640
3641 # for soundcloud, a request to a cross domain is required for cookies
3642 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3643
3644 try:
3645 self._downloader.process_info({
3646 'id': video_id.decode('utf-8'),
3647 'url': mediaURL,
3648 'uploader': uploader.decode('utf-8'),
3649 'upload_date': upload_date,
3650 'title': simple_title.decode('utf-8'),
3651 'stitle': simple_title.decode('utf-8'),
3652 'ext': u'mp3',
3653 'format': u'NA',
3654 'player_url': None,
3655 'description': description.decode('utf-8')
3656 })
3657 except UnavailableVideoError:
3658 self._downloader.trouble(u'\nERROR: unable to download video')
3659
3660
3661class InfoQIE(InfoExtractor):
3662 """Information extractor for infoq.com"""
3663
3664 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3665 IE_NAME = u'infoq'
3666
3667 def report_webpage(self, video_id):
3668 """Report information extraction."""
3669 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3670
3671 def report_extraction(self, video_id):
3672 """Report information extraction."""
3673 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3674
235b3ba4
PH
3675 def _real_extract(self, url):
3676 htmlParser = HTMLParser.HTMLParser()
3677
3678 mobj = re.match(self._VALID_URL, url)
3679 if mobj is None:
3680 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3681 return
3682
3683 self.report_webpage(url)
3684
3685 request = urllib2.Request(url)
3686 try:
3687 webpage = urllib2.urlopen(request).read()
3688 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3689 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3690 return
3691
3692 self.report_extraction(url)
3693
3694
3695 # Extract video URL
3696 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3697 if mobj is None:
3698 self._downloader.trouble(u'ERROR: unable to extract video url')
3699 return
3700 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3701
3702
3703 # Extract title
3704 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3705 if mobj is None:
3706 self._downloader.trouble(u'ERROR: unable to extract video title')
3707 return
3708 video_title = mobj.group(1).decode('utf-8')
3709
3710 # Extract description
3711 video_description = u'No description available.'
3712 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3713 if mobj is not None:
3714 video_description = mobj.group(1).decode('utf-8')
3715
3716 video_filename = video_url.split('/')[-1]
3717 video_id, extension = video_filename.split('.')
3718
3719 self._downloader.increment_downloads()
3720 info = {
3721 'id': video_id,
3722 'url': video_url,
3723 'uploader': None,
3724 'upload_date': None,
3725 'title': video_title,
e092418d 3726 'stitle': _simplify_title(video_title),
235b3ba4
PH
3727 'ext': extension,
3728 'format': extension, # Extension is always(?) mp4, but seems to be flv
3729 'thumbnail': None,
3730 'description': video_description,
3731 'player_url': None,
3732 }
3733
3734 try:
3735 self._downloader.process_info(info)
3736 except UnavailableVideoError, err:
3737 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3738
849edab8
PH
3739class MixcloudIE(InfoExtractor):
3740 """Information extractor for www.mixcloud.com"""
3741 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3742 IE_NAME = u'mixcloud'
3743
3744 def __init__(self, downloader=None):
3745 InfoExtractor.__init__(self, downloader)
3746
3747 def report_download_json(self, file_id):
3748 """Report JSON download."""
3749 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3750
3751 def report_extraction(self, file_id):
3752 """Report information extraction."""
3753 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3754
3755 def get_urls(self, jsonData, fmt, bitrate='best'):
3756 """Get urls from 'audio_formats' section in json"""
3757 file_url = None
3758 try:
3759 bitrate_list = jsonData[fmt]
3760 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3761 bitrate = max(bitrate_list) # select highest
3762
3763 url_list = jsonData[fmt][bitrate]
3764 except TypeError: # we have no bitrate info.
3765 url_list = jsonData[fmt]
3766
3767 return url_list
3768
3769 def check_urls(self, url_list):
3770 """Returns 1st active url from list"""
3771 for url in url_list:
3772 try:
3773 urllib2.urlopen(url)
3774 return url
3775 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3776 url = None
3777
3778 return None
3779
3780 def _print_formats(self, formats):
3781 print 'Available formats:'
3782 for fmt in formats.keys():
3783 for b in formats[fmt]:
3784 try:
3785 ext = formats[fmt][b][0]
3786 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3787 except TypeError: # we have no bitrate info
3788 ext = formats[fmt][0]
3789 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3790 break
3791
3792 def _real_extract(self, url):
3793 mobj = re.match(self._VALID_URL, url)
3794 if mobj is None:
3795 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3796 return
3797 # extract uploader & filename from url
3798 uploader = mobj.group(1).decode('utf-8')
3799 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3800
3801 # construct API request
3802 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3803 # retrieve .json file with links to files
3804 request = urllib2.Request(file_url)
3805 try:
3806 self.report_download_json(file_url)
3807 jsonData = urllib2.urlopen(request).read()
3808 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3809 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3810 return
3811
3812 # parse JSON
3813 json_data = json.loads(jsonData)
3814 player_url = json_data['player_swf_url']
3815 formats = dict(json_data['audio_formats'])
3816
3817 req_format = self._downloader.params.get('format', None)
3818 bitrate = None
3819
3820 if self._downloader.params.get('listformats', None):
3821 self._print_formats(formats)
3822 return
3823
3824 if req_format is None or req_format == 'best':
3825 for format_param in formats.keys():
3826 url_list = self.get_urls(formats, format_param)
3827 # check urls
3828 file_url = self.check_urls(url_list)
3829 if file_url is not None:
3830 break # got it!
3831 else:
3832 if req_format not in formats.keys():
3833 self._downloader.trouble(u'ERROR: format is not available')
3834 return
3835
3836 url_list = self.get_urls(formats, req_format)
3837 file_url = self.check_urls(url_list)
3838 format_param = req_format
3839
3840 # We have audio
3841 self._downloader.increment_downloads()
3842 try:
3843 # Process file information
3844 self._downloader.process_info({
fefb166c
PH
3845 'id': file_id.decode('utf-8'),
3846 'url': file_url.decode('utf-8'),
849edab8 3847 'uploader': uploader.decode('utf-8'),
fefb166c
PH
3848 'upload_date': u'NA',
3849 'title': json_data['name'],
3850 'stitle': _simplify_title(json_data['name']),
3851 'ext': file_url.split('.')[-1].decode('utf-8'),
3852 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3853 'thumbnail': json_data['thumbnail_url'],
3854 'description': json_data['description'],
3855 'player_url': player_url.decode('utf-8'),
849edab8
PH
3856 })
3857 except UnavailableVideoError, err:
3858 self._downloader.trouble(u'ERROR: unable to download file')
3859
dd17922a
PH
3860class StanfordOpenClassroomIE(InfoExtractor):
3861 """Information extractor for Stanford's Open ClassRoom"""
3862
0b14e0b3 3863 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
dd17922a
PH
3864 IE_NAME = u'stanfordoc'
3865
0b14e0b3
PH
3866 def report_download_webpage(self, objid):
3867 """Report information extraction."""
3868 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3869
dd17922a
PH
3870 def report_extraction(self, video_id):
3871 """Report information extraction."""
3872 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3873
3874 def _real_extract(self, url):
3875 mobj = re.match(self._VALID_URL, url)
3876 if mobj is None:
3877 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3878 return
3879
3880 if mobj.group('course') and mobj.group('video'): # A specific video
3881 course = mobj.group('course')
3882 video = mobj.group('video')
3883 info = {
3884 'id': _simplify_title(course + '_' + video),
3885 }
3886
3887 self.report_extraction(info['id'])
3888 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3889 xmlUrl = baseUrl + video + '.xml'
3890 try:
3891 metaXml = urllib2.urlopen(xmlUrl).read()
3892 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
0b14e0b3 3893 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
dd17922a
PH
3894 return
3895 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3896 try:
3897 info['title'] = mdoc.findall('./title')[0].text
3898 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3899 except IndexError:
3900 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3901 return
3902 info['stitle'] = _simplify_title(info['title'])
3903 info['ext'] = info['url'].rpartition('.')[2]
3904 info['format'] = info['ext']
3905 self._downloader.increment_downloads()
3906 try:
3907 self._downloader.process_info(info)
3908 except UnavailableVideoError, err:
3909 self._downloader.trouble(u'\nERROR: unable to download video')
0b14e0b3
PH
3910 elif mobj.group('course'): # A course page
3911 unescapeHTML = HTMLParser.HTMLParser().unescape
dd17922a 3912
0b14e0b3
PH
3913 course = mobj.group('course')
3914 info = {
3915 'id': _simplify_title(course),
3916 'type': 'playlist',
3917 }
dd17922a 3918
0b14e0b3
PH
3919 self.report_download_webpage(info['id'])
3920 try:
3921 coursepage = urllib2.urlopen(url).read()
3922 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3923 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3924 return
dd17922a 3925
0b14e0b3
PH
3926 m = re.search('<h1>([^<]+)</h1>', coursepage)
3927 if m:
3928 info['title'] = unescapeHTML(m.group(1))
3929 else:
3930 info['title'] = info['id']
3931 info['stitle'] = _simplify_title(info['title'])
3932
3933 m = re.search('<description>([^<]+)</description>', coursepage)
3934 if m:
3935 info['description'] = unescapeHTML(m.group(1))
3936
3937 links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3938 info['list'] = [
3939 {
3940 'type': 'reference',
3941 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3942 }
3943 for vpage in links]
3944
3945 for entry in info['list']:
3946 assert entry['type'] == 'reference'
3947 self.extract(entry['url'])
3948 else: # Root page
3949 unescapeHTML = HTMLParser.HTMLParser().unescape
3950
3951 info = {
3952 'id': 'Stanford OpenClassroom',
3953 'type': 'playlist',
3954 }
3955
3956 self.report_download_webpage(info['id'])
3957 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3958 try:
3959 rootpage = urllib2.urlopen(rootURL).read()
3960 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3961 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3962 return
3963
3964 info['title'] = info['id']
3965 info['stitle'] = _simplify_title(info['title'])
3966
3967 links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3968 info['list'] = [
3969 {
3970 'type': 'reference',
3971 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3972 }
3973 for cpage in links]
dd17922a 3974
0b14e0b3
PH
3975 for entry in info['list']:
3976 assert entry['type'] == 'reference'
3977 self.extract(entry['url'])
235b3ba4 3978
dcb3c22e 3979class MTVIE(InfoExtractor):
fefb166c
PH
3980 """Information extractor for MTV.com"""
3981
3982 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3983 IE_NAME = u'mtv'
3984
3985 def report_webpage(self, video_id):
3986 """Report information extraction."""
3987 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3988
3989 def report_extraction(self, video_id):
3990 """Report information extraction."""
3991 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3992
3993 def _real_extract(self, url):
3994 mobj = re.match(self._VALID_URL, url)
3995 if mobj is None:
3996 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3997 return
3998 if not mobj.group('proto'):
3999 url = 'http://' + url
4000 video_id = mobj.group('videoid')
4001 self.report_webpage(video_id)
4002
4003 request = urllib2.Request(url)
4004 try:
4005 webpage = urllib2.urlopen(request).read()
4006 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4007 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4008 return
4009
4010 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4011 if mobj is None:
4012 self._downloader.trouble(u'ERROR: unable to extract song name')
4013 return
4014 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4015 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4016 if mobj is None:
4017 self._downloader.trouble(u'ERROR: unable to extract performer')
4018 return
4019 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4020 video_title = performer + ' - ' + song_name
4021
4022 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4023 if mobj is None:
4024 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4025 return
4026 mtvn_uri = mobj.group(1)
4027
4028 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4029 if mobj is None:
4030 self._downloader.trouble(u'ERROR: unable to extract content id')
4031 return
4032 content_id = mobj.group(1)
4033
4034 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4035 self.report_extraction(video_id)
4036 request = urllib2.Request(videogen_url)
4037 try:
4038 metadataXml = urllib2.urlopen(request).read()
4039 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4040 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4041 return
4042
4043 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4044 renditions = mdoc.findall('.//rendition')
4045
4046 # For now, always pick the highest quality.
4047 rendition = renditions[-1]
4048
4049 try:
4050 _,_,ext = rendition.attrib['type'].partition('/')
4051 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4052 video_url = rendition.find('./src').text
4053 except KeyError:
4054 self._downloader.trouble('Invalid rendition field.')
4055 return
4056
4057 self._downloader.increment_downloads()
4058 info = {
4059 'id': video_id,
4060 'url': video_url,
4061 'uploader': performer,
4062 'title': video_title,
4063 'stitle': _simplify_title(video_title),
4064 'ext': ext,
4065 'format': format,
4066 }
4067
4068 try:
4069 self._downloader.process_info(info)
4070 except UnavailableVideoError, err:
4071 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
dcb3c22e 4072
235b3ba4
PH
4073
class PostProcessor(object):
	"""Base class for download post-processing steps.

	Instances are registered on a downloader via its add_post_processor()
	method. After a successful download the downloader walks its chain of
	PostProcessors, feeding the first one the info dictionary and each
	subsequent one the value returned by its predecessor.

	A PostProcessor that returns None stops the chain; otherwise the chain
	runs to the end.

	Registration is mutual, mirroring how InfoExtractor objects are wired
	to a downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this post processor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Execute this post-processing step.

		The "information" argument is an InfoExtractor-style dictionary
		extended with a "filepath" key naming the downloaded file.

		Returning None halts the post-processing chain; returning a
		(possibly modified) information dictionary passes it along to the
		next PostProcessor. Implementations may raise PostProcessingError
		to signal failure to the calling downloader.
		"""
		# The base implementation is a no-op pass-through.
		return information
4119
633cf7cb
PH
class AudioConversionError(BaseException):
	"""Raised when an ffmpeg/ffprobe audio conversion step fails.

	NOTE(review): derives from BaseException rather than Exception, so a
	generic `except Exception` will not catch it; kept unchanged for
	backward compatibility with existing handlers.
	"""

	def __init__(self, message):
		# BUGFIX: forward to the base class so str(err) and err.args carry
		# the message instead of being empty.
		BaseException.__init__(self, message)
		self.message = message
235b3ba4
PH
4123
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded video
	into a standalone audio file using ffmpeg/ffprobe."""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec
		self._preferredquality = preferredquality
		self._keepvideo = keepvideo

	@staticmethod
	def get_audio_codec(path):
		"""Return the codec name of the file's audio stream, or None."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
			probe = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = probe.communicate()[0]
			if probe.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		# ffprobe lists codec_name before codec_type per stream: remember
		# the most recent codec_name and report it once an audio stream
		# type line is seen.
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Transcode path into out_path with ffmpeg.

		Raises AudioConversionError when ffmpeg is missing or exits
		with a non-zero status.
		"""
		acodec_opts = [] if codec is None else ['-acodec', codec]
		cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
		try:
			proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			stdout, stderr = proc.communicate()
		except (IOError, OSError):
			e = sys.exc_info()[1]
			if isinstance(e, OSError) and e.errno == 2:
				raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
			else:
				raise e
		if proc.returncode != 0:
			# The last stderr line carries ffmpeg's failure reason.
			msg = stderr.strip().split('\n')[-1]
			raise AudioConversionError(msg)

	def run(self, information):
		"""Convert the downloaded file to the preferred audio format."""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless repack of raw AAC into an m4a container.
				acodec = 'copy'
				extension = self._preferredcodec
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Source codec is already a supported target: copy losslessly.
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# Any other source codec is transcoded to MP3.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# A specific target codec was requested: transcode (lossy).
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'm4a':
				more_opts += ['-absf', 'aac_adtstoasc']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'
			if self._preferredcodec == 'wav':
				extension = 'wav'
				more_opts += ['-f', 'wav']

		# not os.path.splitext, since the latter does not work on unicode in all setups
		prefix, sep, ext = path.rpartition(u'.')
		new_path = prefix + sep + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
		try:
			self.run_ffmpeg(path, new_path, acodec, more_opts)
		except:
			etype, e, tb = sys.exc_info()
			if isinstance(e, AudioConversionError):
				self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
			else:
				self._downloader.to_stderr(u'ERROR: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
			except:
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(_encodeFilename(path))
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
4248
4249
4250def updateSelf(downloader, filename):
4251 ''' Update the program file with the latest version from the repository '''
4252 # Note: downloader only used for options
4253 if not os.access(filename, os.W_OK):
4254 sys.exit('ERROR: no write permissions on %s' % filename)
4255
6d58c454 4256 downloader.to_screen(u'Updating to latest version...')
235b3ba4
PH
4257
4258 try:
4259 try:
4260 urlh = urllib.urlopen(UPDATE_URL)
4261 newcontent = urlh.read()
4262
4263 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4264 if vmatch is not None and vmatch.group(1) == __version__:
6d58c454 4265 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
235b3ba4
PH
4266 return
4267 finally:
4268 urlh.close()
4269 except (IOError, OSError), err:
4270 sys.exit('ERROR: unable to download latest version')
4271
4272 try:
4273 outf = open(filename, 'wb')
4274 try:
4275 outf.write(newcontent)
4276 finally:
4277 outf.close()
4278 except (IOError, OSError), err:
4279 sys.exit('ERROR: unable to overwrite current version')
4280
6d58c454 4281 downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
235b3ba4
PH
4282
def parseOpts():
	"""Build the option parser, read config files, and parse the command line.

	Options are collected from /etc/youtube-dl.conf, then the per-user
	configuration file, then sys.argv; later sources override earlier
	ones because optparse keeps the last value it sees for each option.

	Returns a (parser, opts, args) tuple: the parser (so callers can
	invoke parser.error()), the parsed option values, and the remaining
	positional arguments (the URLs to download).
	"""
	def _readOptions(filename_bytes):
		# Read shell-style options from a config file, one or more per
		# line; '#' starts a comment (shlex comments=True).
		try:
			optionf = open(filename_bytes)
		except IOError:
			return [] # silently skip if file is not present
		try:
			res = []
			for l in optionf:
				res += shlex.split(l, comments=True)
		finally:
			optionf.close()
		return res

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		# Prefer an explicit COLUMNS environment variable; otherwise ask
		# stty.  Returns None when the width cannot be determined (e.g.
		# output is not a terminal, or stty is unavailable).
		columns = os.environ.get('COLUMNS', None)
		if columns:
			return int(columns)

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out, err = sp.communicate()
			return int(out.split()[1])
		except Exception:
			# FIX: was a bare "except:", which also swallowed
			# KeyboardInterrupt/SystemExit raised while stty runs.
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	# conflict_handler='resolve' lets later options silently override
	# earlier ones with the same switch (e.g. -v is first --version,
	# then redefined below as --verbose).
	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	selection      = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
	selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='all')
	video_format.add_option('--prefer-free-formats',
			action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	video_format.add_option('-F', '--list-formats',
			action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
	video_format.add_option('--write-srt',
			action='store_true', dest='writesubtitles',
			help='write video closed captions to a .srt file (currently youtube only)', default=False)
	video_format.add_option('--srt-lang',
			action='store', dest='subtitleslang', metavar='LANG',
			help='language of the closed captions to download (optional) use IETF language tags like \'en\'')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--get-format',
			action='store_true', dest='getformat',
			help='simulate, quiet but print output format', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)
	verbosity.add_option('-v', '--verbose',
			action='store_true', dest='verbose', help='print various debugging information', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
	filesystem.add_option('--no-continue',
			action='store_false', dest='continue_dl',
			help='do not resume partially downloaded files (restart from beginning)')
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
	postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
			help='ffmpeg audio bitrate specification, 128k by default')
	postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
			help='keeps the video file on disk after the post-processing; the video is erased by default')


	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	# XDG Base Directory spec: per-user config lives under
	# $XDG_CONFIG_HOME, falling back to ~/.config.
	xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
	if xdg_config_home:
		userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
	else:
		userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
	argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
	opts, args = parser.parse_args(argv)

	return parser, opts, args
4495
def gen_extractors():
	"""Instantiate one of every supported info extractor.

	The ordering of the returned list is significant: a URL is handled
	by the first extractor whose suitable() check matches it, so the
	generic fallback must come last.
	"""
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()
	ies = [
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
		StanfordOpenClassroomIE(),
		MTVIE(),

		# Catch-all; must stay at the end of the list.
		GenericIE()
	]
	return ies
4532
def _real_main():
	"""Parse options, validate them, and run the downloader.

	Exits the process via sys.exit() in every path: with an error
	message for fatal misconfiguration, with 0 for the informational
	modes (--dump-user-agent, --list-extractors), or with the
	downloader's return code after processing all URLs.
	"""
	parser, opts, args = parseOpts()

	# Open appropriate CookieJar
	if opts.cookiefile is None:
		jar = cookielib.CookieJar()
	else:
		try:
			jar = cookielib.MozillaCookieJar(opts.cookiefile)
			# Only load if the file already exists and is readable; a
			# missing file is fine, it will be created on save below.
			if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
				jar.load()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to open cookie file')

	# Dump user agent
	if opts.dump_user_agent:
		print std_headers['User-Agent']
		sys.exit(0)

	# Batch file verification
	batchurls = []
	if opts.batchfile is not None:
		try:
			if opts.batchfile == '-':
				batchfd = sys.stdin
			else:
				batchfd = open(opts.batchfile, 'r')
			batchurls = batchfd.readlines()
			batchurls = [x.strip() for x in batchurls]
			# Drop blank lines and comment lines starting with '#', '/' or ';'.
			batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		except IOError:
			sys.exit(u'ERROR: batch file could not be read')
	all_urls = batchurls + args

	# General configuration
	# Install a global opener so every urllib2 request in the program
	# goes through the cookie jar, proxy handler and YoutubeDLHandler.
	cookie_processor = urllib2.HTTPCookieProcessor(jar)
	proxy_handler = urllib2.ProxyHandler()
	opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
	urllib2.install_opener(opener)
	socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

	if opts.verbose:
		print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

	extractors = gen_extractors()

	if opts.list_extractors:
		# Print each extractor name followed by the given URLs it would
		# handle; each URL is claimed by at most one extractor.
		for ie in extractors:
			print(ie.IE_NAME)
			matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
			all_urls = filter(lambda url: url not in matchedUrls, all_urls)
			for mu in matchedUrls:
				print(u'  ' + mu)
		sys.exit(0)

	# Conflicting, missing and erroneous options
	if opts.usenetrc and (opts.username is not None or opts.password is not None):
		parser.error(u'using .netrc conflicts with giving username/password')
	if opts.password is not None and opts.username is None:
		parser.error(u'account username missing')
	if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
		parser.error(u'using output template conflicts with using title, literal title or auto number')
	if opts.usetitle and opts.useliteral:
		parser.error(u'using title conflicts with using literal title')
	if opts.username is not None and opts.password is None:
		# Username given without password: prompt interactively.
		opts.password = getpass.getpass(u'Type account password and press return:')
	if opts.ratelimit is not None:
		numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
		if numeric_limit is None:
			parser.error(u'invalid rate limit specified')
		opts.ratelimit = numeric_limit
	if opts.retries is not None:
		try:
			opts.retries = long(opts.retries)
		except (TypeError, ValueError), err:
			parser.error(u'invalid retry count specified')
	try:
		opts.playliststart = int(opts.playliststart)
		if opts.playliststart <= 0:
			raise ValueError(u'Playlist start must be positive')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist start number specified')
	try:
		opts.playlistend = int(opts.playlistend)
		# -1 is the sentinel for "until the end of the playlist".
		if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
			raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')
	if opts.extractaudio:
		if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
			parser.error(u'invalid audio format specified')

	# File downloader
	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		# Any of the "print X and exit" modes implies quiet operation
		# and skipping the actual download.
		'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'forcethumbnail': opts.getthumbnail,
		'forcedescription': opts.getdescription,
		'forcefilename': opts.getfilename,
		'forceformat': opts.getformat,
		'simulate': opts.simulate,
		'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'format': opts.format,
		'format_limit': opts.format_limit,
		'listformats': opts.listformats,
		# Pick a default output template from the title/literal/number
		# flags when -o was not given; first matching clause wins.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'retries': opts.retries,
		'continuedl': opts.continue_dl,
		'noprogress': opts.noprogress,
		'playliststart': opts.playliststart,
		'playlistend': opts.playlistend,
		# When downloading to stdout, keep log messages off stdout.
		'logtostderr': opts.outtmpl == '-',
		'consoletitle': opts.consoletitle,
		'nopart': opts.nopart,
		'updatetime': opts.updatetime,
		'writedescription': opts.writedescription,
		'writeinfojson': opts.writeinfojson,
		'writesubtitles': opts.writesubtitles,
		'subtitleslang': opts.subtitleslang,
		'matchtitle': opts.matchtitle,
		'rejecttitle': opts.rejecttitle,
		'max_downloads': opts.max_downloads,
		'prefer_free_formats': opts.prefer_free_formats,
		'verbose': opts.verbose,
		})
	for extractor in extractors:
		fd.add_info_extractor(extractor)

	# PostProcessors
	if opts.extractaudio:
		fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

	# Update version
	if opts.update_self:
		updateSelf(fd, sys.argv[0])

	# Maybe do nothing
	if len(all_urls) < 1:
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')
		else:
			# --update with no URLs is a complete, successful run.
			sys.exit()

	try:
		retcode = fd.download(all_urls)
	except MaxDownloadsReached:
		fd.to_screen(u'--max-download limit reached, aborting.')
		retcode = 101

	# Dump cookie jar if requested
	if opts.cookiefile is not None:
		try:
			jar.save()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to save cookie jar')

	sys.exit(retcode)
4706
def main():
	"""Program entry point: run _real_main and map known failures to exits.

	DownloadError is already reported by the downloader, so it only
	sets a non-zero exit status; the other cases print a message.
	"""
	try:
		_real_main()
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except DownloadError:
		sys.exit(1)
4716
# Allow the module to be executed directly as a script.
if __name__ == '__main__':
	main()

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: