]> jfr.im git - yt-dlp.git/blame - youtube-dl
Use freedesktop.org mandated user config file location (Suggested by Tyll in #231)
[yt-dlp.git] / youtube-dl
CommitLineData
4fa74b52
RG
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
2770590d
GV
3
4__author__ = (
2c8d32de
PH
5 'Ricardo Garcia Gonzalez',
6 'Danny Colligan',
7 'Benjamin Johnson',
8 'Vasyl\' Vavrychuk',
9 'Witold Baryluk',
10 'Paweł Paprota',
11 'Gergely Imreh',
6ae796b1 12 'Rogério Brito',
eb11aacc 13 'Philipp Hagemeister',
6fc5b0bb 14 'Sören Schulze',
aab771fb
PH
15 'Kevin Ngo',
16 'Ori Avtalion',
a95567af 17 'shizeeg',
2770590d
GV
18 )
19
2c8d32de 20__license__ = 'Public Domain'
fa2672f9 21__version__ = '2011.11.23'
2770590d 22
8236e851 23UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
d207e7cf 24
80066952 25import cookielib
a1f03c7b 26import datetime
1987c232 27import gzip
4fa74b52 28import htmlentitydefs
f9c68787 29import HTMLParser
4fa74b52 30import httplib
2546e767 31import locale
4fa74b52
RG
32import math
33import netrc
34import os
35import os.path
36import re
37import socket
38import string
0487b407 39import subprocess
4fa74b52
RG
40import sys
41import time
42import urllib
43import urllib2
c6b55a8d 44import warnings
1987c232 45import zlib
a04e80a4 46
0a3c8b62
PH
47if os.name == 'nt':
48 import ctypes
49
50try:
51 import email.utils
52except ImportError: # Python 2.4
53 import email.Utils
c6b55a8d
PH
54try:
55 import cStringIO as StringIO
56except ImportError:
57 import StringIO
58
a04e80a4
RG
59# parse_qs was moved from the cgi module to the urlparse module recently.
60try:
61 from urlparse import parse_qs
62except ImportError:
63 from cgi import parse_qs
4fa74b52 64
c6b55a8d
PH
65try:
66 import lxml.etree
2b70537d 67except ImportError:
c6b55a8d
PH
68 pass # Handled below
69
c8e30044
PH
70try:
71 import xml.etree.ElementTree
afb5b55d
PH
72except ImportError: # Python<2.5: Not officially supported, but let it slip
73 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
c8e30044 74
f995f712 75std_headers = {
c44b9ee9 76 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
4fa74b52 77 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
96942e62 78 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
a57ed21f 79 'Accept-Encoding': 'gzip, deflate',
4fa74b52
RG
80 'Accept-Language': 'en-us,en;q=0.5',
81}
82
437d76c1
PH
83try:
84 import json
91e6a385 85except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
437d76c1
PH
86 import re
87 class json(object):
88 @staticmethod
89 def loads(s):
90 s = s.decode('UTF-8')
91 def raiseError(msg, i):
92 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
93 def skipSpace(i, expectMore=True):
94 while i < len(s) and s[i] in ' \t\r\n':
95 i += 1
96 if expectMore:
97 if i >= len(s):
98 raiseError('Premature end', i)
99 return i
100 def decodeEscape(match):
101 esc = match.group(1)
102 _STATIC = {
103 '"': '"',
104 '\\': '\\',
105 '/': '/',
106 'b': unichr(0x8),
107 'f': unichr(0xc),
108 'n': '\n',
109 'r': '\r',
110 't': '\t',
111 }
112 if esc in _STATIC:
113 return _STATIC[esc]
114 if esc[0] == 'u':
115 if len(esc) == 1+4:
116 return unichr(int(esc[1:5], 16))
117 if len(esc) == 5+6 and esc[5:7] == '\\u':
118 hi = int(esc[1:5], 16)
119 low = int(esc[7:11], 16)
120 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
121 raise ValueError('Unknown escape ' + str(esc))
122 def parseString(i):
123 i += 1
124 e = i
125 while True:
126 e = s.index('"', e)
127 bslashes = 0
128 while s[e-bslashes-1] == '\\':
129 bslashes += 1
130 if bslashes % 2 == 1:
131 e += 1
132 continue
133 break
134 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
135 stri = rexp.sub(decodeEscape, s[i:e])
136 return (e+1,stri)
137 def parseObj(i):
138 i += 1
139 res = {}
140 i = skipSpace(i)
141 if s[i] == '}': # Empty dictionary
142 return (i+1,res)
143 while True:
144 if s[i] != '"':
145 raiseError('Expected a string object key', i)
146 i,key = parseString(i)
147 i = skipSpace(i)
148 if i >= len(s) or s[i] != ':':
149 raiseError('Expected a colon', i)
150 i,val = parse(i+1)
151 res[key] = val
152 i = skipSpace(i)
153 if s[i] == '}':
154 return (i+1, res)
155 if s[i] != ',':
156 raiseError('Expected comma or closing curly brace', i)
157 i = skipSpace(i+1)
158 def parseArray(i):
159 res = []
160 i = skipSpace(i+1)
161 if s[i] == ']': # Empty array
162 return (i+1,res)
163 while True:
164 i,val = parse(i)
165 res.append(val)
166 i = skipSpace(i) # Raise exception if premature end
167 if s[i] == ']':
168 return (i+1, res)
169 if s[i] != ',':
170 raiseError('Expected a comma or closing bracket', i)
171 i = skipSpace(i+1)
172 def parseDiscrete(i):
173 for k,v in {'true': True, 'false': False, 'null': None}.items():
174 if s.startswith(k, i):
175 return (i+len(k), v)
176 raiseError('Not a boolean (or null)', i)
177 def parseNumber(i):
178 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
179 if mobj is None:
180 raiseError('Not a number', i)
181 nums = mobj.group(1)
182 if '.' in nums or 'e' in nums or 'E' in nums:
183 return (i+len(nums), float(nums))
184 return (i+len(nums), int(nums))
185 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
186 def parse(i):
187 i = skipSpace(i)
188 i,res = CHARMAP.get(s[i], parseNumber)(i)
189 i = skipSpace(i, False)
190 return (i,res)
191 i,res = parse(0)
192 if i < len(s):
193 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
194 return res
195
eae2666c
RG
196def preferredencoding():
197 """Get preferred encoding.
198
199 Returns the best encoding scheme for the system, based on
200 locale.getpreferredencoding() and some further tweaks.
201 """
f94b636c
RG
202 def yield_preferredencoding():
203 try:
204 pref = locale.getpreferredencoding()
205 u'TEST'.encode(pref)
206 except:
207 pref = 'UTF-8'
208 while True:
209 yield pref
210 return yield_preferredencoding().next()
eae2666c 211
c0a10ca8 212
490fd7ae
RG
213def htmlentity_transform(matchobj):
214 """Transforms an HTML entity to a Unicode character.
d3975459 215
490fd7ae
RG
216 This function receives a match object and is intended to be used with
217 the re.sub() function.
218 """
219 entity = matchobj.group(1)
220
221 # Known non-numeric HTML entity
222 if entity in htmlentitydefs.name2codepoint:
223 return unichr(htmlentitydefs.name2codepoint[entity])
224
225 # Unicode character
226 mobj = re.match(ur'(?u)#(x?\d+)', entity)
227 if mobj is not None:
228 numstr = mobj.group(1)
229 if numstr.startswith(u'x'):
230 base = 16
231 numstr = u'0%s' % numstr
232 else:
233 base = 10
234 return unichr(long(numstr, base))
235
236 # Unknown entity in name, return its literal representation
237 return (u'&%s;' % entity)
238
c0a10ca8 239
490fd7ae 240def sanitize_title(utitle):
31bcb480 241 """Sanitizes a video title so it could be used as part of a filename."""
490fd7ae 242 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
490fd7ae
RG
243 return utitle.replace(unicode(os.sep), u'%')
244
c0a10ca8 245
31bcb480
RG
246def sanitize_open(filename, open_mode):
247 """Try to open the given filename, and slightly tweak it if this fails.
248
249 Attempts to open the given filename. If this fails, it tries to change
250 the filename slightly, step by step, until it's either able to open it
251 or it fails and raises a final exception, like the standard open()
252 function.
253
254 It returns the tuple (stream, definitive_file_name).
255 """
256 try:
131bc765 257 if filename == u'-':
e08878f4
RG
258 if sys.platform == 'win32':
259 import msvcrt
260 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
131bc765 261 return (sys.stdout, filename)
31bcb480
RG
262 stream = open(filename, open_mode)
263 return (stream, filename)
264 except (IOError, OSError), err:
265 # In case of error, try to remove win32 forbidden chars
ca6a11fa 266 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
31bcb480
RG
267
268 # An exception here should be caught in the caller
269 stream = open(filename, open_mode)
270 return (stream, filename)
271
c0a10ca8 272
09bd408c 273def timeconvert(timestr):
c0a10ca8
F
274 """Convert RFC 2822 defined time string into system timestamp"""
275 timestamp = None
276 timetuple = email.utils.parsedate_tz(timestr)
277 if timetuple is not None:
278 timestamp = email.utils.mktime_tz(timetuple)
279 return timestamp
280
e33e3045 281def _simplify_title(title):
468c9925
PH
282 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
283 return expr.sub(u'_', title).strip(u'_')
09bd408c 284
e5bf0f55
RG
285class DownloadError(Exception):
286 """Download Error exception.
d3975459 287
e5bf0f55
RG
288 This exception may be thrown by FileDownloader objects if they are not
289 configured to continue on errors. They will contain the appropriate
290 error message.
291 """
292 pass
293
c0a10ca8 294
e5bf0f55
RG
295class SameFileError(Exception):
296 """Same File exception.
297
298 This exception will be thrown by FileDownloader objects if they detect
299 multiple files would have to be downloaded to the same file on disk.
300 """
301 pass
302
c0a10ca8 303
65cd34c5
RG
304class PostProcessingError(Exception):
305 """Post Processing exception.
306
307 This exception may be raised by PostProcessor's .run() method to
308 indicate an error in the postprocessing task.
309 """
310 pass
311
c0a10ca8 312
73f4e7af 313class UnavailableVideoError(Exception):
7b7759f5 314 """Unavailable Format exception.
315
316 This exception will be thrown when a video is requested
317 in a format that is not available for that video.
318 """
d69a1c91
RG
319 pass
320
c0a10ca8 321
d69a1c91
RG
322class ContentTooShortError(Exception):
323 """Content Too Short exception.
324
325 This exception may be raised by FileDownloader objects when a file they
326 download is too small for what the server announced first, indicating
327 the connection was probably interrupted.
328 """
329 # Both in bytes
330 downloaded = None
331 expected = None
332
333 def __init__(self, downloaded, expected):
334 self.downloaded = downloaded
335 self.expected = expected
7b7759f5 336
c0a10ca8 337
1987c232
RG
338class YoutubeDLHandler(urllib2.HTTPHandler):
339 """Handler for HTTP requests and responses.
340
341 This class, when installed with an OpenerDirector, automatically adds
342 the standard headers to every HTTP request and handles gzipped and
343 deflated responses from web servers. If compression is to be avoided in
344 a particular request, the original request in the program code only has
345 to include the HTTP header "Youtubedl-No-Compression", which will be
346 removed before making the real request.
c0a10ca8 347
1987c232
RG
348 Part of this code was copied from:
349
c0a10ca8
F
350 http://techknack.net/python-urllib2-handlers/
351
1987c232
RG
352 Andrew Rowls, the author of that code, agreed to release it to the
353 public domain.
354 """
355
356 @staticmethod
357 def deflate(data):
358 try:
359 return zlib.decompress(data, -zlib.MAX_WBITS)
360 except zlib.error:
361 return zlib.decompress(data)
c0a10ca8 362
7b531c0b
RG
363 @staticmethod
364 def addinfourl_wrapper(stream, headers, url, code):
365 if hasattr(urllib2.addinfourl, 'getcode'):
366 return urllib2.addinfourl(stream, headers, url, code)
0f6b00b5
RG
367 ret = urllib2.addinfourl(stream, headers, url)
368 ret.code = code
369 return ret
c0a10ca8 370
1987c232
RG
371 def http_request(self, req):
372 for h in std_headers:
373 if h in req.headers:
374 del req.headers[h]
375 req.add_header(h, std_headers[h])
376 if 'Youtubedl-no-compression' in req.headers:
377 if 'Accept-encoding' in req.headers:
378 del req.headers['Accept-encoding']
379 del req.headers['Youtubedl-no-compression']
380 return req
381
382 def http_response(self, req, resp):
383 old_resp = resp
384 # gzip
385 if resp.headers.get('Content-encoding', '') == 'gzip':
386 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
7b531c0b 387 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
1987c232
RG
388 resp.msg = old_resp.msg
389 # deflate
390 if resp.headers.get('Content-encoding', '') == 'deflate':
391 gz = StringIO.StringIO(self.deflate(resp.read()))
7b531c0b 392 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
1987c232
RG
393 resp.msg = old_resp.msg
394 return resp
395
c0a10ca8 396
4fa74b52
RG
397class FileDownloader(object):
398 """File Downloader class.
399
400 File downloader objects are the ones responsible of downloading the
401 actual video file and writing it to disk if the user has requested
402 it, among some other tasks. In most cases there should be one per
403 program. As, given a video URL, the downloader doesn't know how to
404 extract all the needed information, task that InfoExtractors do, it
405 has to pass the URL to one of them.
406
407 For this, file downloader objects have a method that allows
408 InfoExtractors to be registered in a given order. When it is passed
409 a URL, the file downloader handles it to the first InfoExtractor it
2851b2ca
RG
410 finds that reports being able to handle it. The InfoExtractor extracts
411 all the information about the video or videos the URL refers to, and
412 asks the FileDownloader to process the video information, possibly
413 downloading the video.
4fa74b52
RG
414
415 File downloaders accept a lot of parameters. In order not to saturate
416 the object constructor with arguments, it receives a dictionary of
d0a9affb
RG
417 options instead. These options are available through the params
418 attribute for the InfoExtractors to use. The FileDownloader also
419 registers itself as the downloader in charge for the InfoExtractors
420 that are added to it, so this is a "mutual registration".
4fa74b52
RG
421
422 Available options:
423
80066952
RG
424 username: Username for authentication purposes.
425 password: Password for authentication purposes.
426 usenetrc: Use netrc for authentication instead.
427 quiet: Do not print messages to stdout.
428 forceurl: Force printing final URL.
429 forcetitle: Force printing title.
430 forcethumbnail: Force printing thumbnail URL.
431 forcedescription: Force printing description.
9f796346 432 forcefilename: Force printing final filename.
80066952
RG
433 simulate: Do not download the video files.
434 format: Video format code.
435 format_limit: Highest quality format to try.
436 outtmpl: Template for output names.
437 ignoreerrors: Do not stop on download errors.
438 ratelimit: Download speed limit, in bytes/sec.
439 nooverwrites: Prevent overwriting files.
440 retries: Number of times to retry for HTTP error 5xx
441 continuedl: Try to continue downloads if possible.
442 noprogress: Do not print the progress bar.
443 playliststart: Playlist item to start at.
8cc44341 444 playlistend: Playlist item to end at.
20e91e83
ABP
445 matchtitle: Download only matching titles.
446 rejecttitle: Reject downloads for matching titles.
331ce0a0 447 logtostderr: Log messages to stderr instead of stdout.
ccbd296b 448 consoletitle: Display progress in console window's titlebar.
3fb2c487 449 nopart: Do not use temporary .part files.
e3018902 450 updatetime: Use the Last-modified header to set output file timestamps.
8b95c387 451 writedescription: Write the video description to a .description file
6eb08fbf 452 writeinfojson: Write the video description to a .info.json file
4fa74b52
RG
453 """
454
d0a9affb 455 params = None
4fa74b52 456 _ies = []
65cd34c5 457 _pps = []
9bf386d7 458 _download_retcode = None
7d8d0612 459 _num_downloads = None
331ce0a0 460 _screen_file = None
4fa74b52
RG
461
462 def __init__(self, params):
1c5e2302 463 """Create a FileDownloader object with the given options."""
4fa74b52 464 self._ies = []
65cd34c5 465 self._pps = []
9bf386d7 466 self._download_retcode = 0
7d8d0612 467 self._num_downloads = 0
331ce0a0 468 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
d0a9affb 469 self.params = params
d3975459 470
4fa74b52
RG
471 @staticmethod
472 def format_bytes(bytes):
473 if bytes is None:
474 return 'N/A'
8497c36d
RG
475 if type(bytes) is str:
476 bytes = float(bytes)
477 if bytes == 0.0:
4fa74b52
RG
478 exponent = 0
479 else:
8497c36d 480 exponent = long(math.log(bytes, 1024.0))
4fa74b52 481 suffix = 'bkMGTPEZY'[exponent]
c0a10ca8 482 converted = float(bytes) / float(1024 ** exponent)
4fa74b52
RG
483 return '%.2f%s' % (converted, suffix)
484
485 @staticmethod
486 def calc_percent(byte_counter, data_len):
487 if data_len is None:
488 return '---.-%'
489 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
490
491 @staticmethod
492 def calc_eta(start, now, total, current):
493 if total is None:
494 return '--:--'
495 dif = now - start
496 if current == 0 or dif < 0.001: # One millisecond
497 return '--:--'
498 rate = float(current) / dif
499 eta = long((float(total) - float(current)) / rate)
500 (eta_mins, eta_secs) = divmod(eta, 60)
501 if eta_mins > 99:
502 return '--:--'
503 return '%02d:%02d' % (eta_mins, eta_secs)
504
5121ef20 505 @staticmethod
4fa74b52
RG
506 def calc_speed(start, now, bytes):
507 dif = now - start
508 if bytes == 0 or dif < 0.001: # One millisecond
9fcd8355 509 return '%10s' % '---b/s'
4fa74b52
RG
510 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
511
512 @staticmethod
513 def best_block_size(elapsed_time, bytes):
514 new_min = max(bytes / 2.0, 1.0)
515 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
516 if elapsed_time < 0.001:
e1f18b8a 517 return long(new_max)
4fa74b52
RG
518 rate = bytes / elapsed_time
519 if rate > new_max:
e1f18b8a 520 return long(new_max)
4fa74b52 521 if rate < new_min:
e1f18b8a
RG
522 return long(new_min)
523 return long(rate)
4fa74b52 524
acd3d842
RG
525 @staticmethod
526 def parse_bytes(bytestr):
527 """Parse a string indicating a byte quantity into a long integer."""
528 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
529 if matchobj is None:
530 return None
531 number = float(matchobj.group(1))
532 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
533 return long(round(number * multiplier))
534
4fa74b52
RG
535 def add_info_extractor(self, ie):
536 """Add an InfoExtractor object to the end of the list."""
537 self._ies.append(ie)
538 ie.set_downloader(self)
d3975459 539
65cd34c5
RG
540 def add_post_processor(self, pp):
541 """Add a PostProcessor object to the end of the chain."""
542 self._pps.append(pp)
543 pp.set_downloader(self)
d3975459 544
331ce0a0 545 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
9fcd8355 546 """Print message to stdout if not in quiet mode."""
43ab0ca4
RG
547 try:
548 if not self.params.get('quiet', False):
331ce0a0
RG
549 terminator = [u'\n', u''][skip_eol]
550 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
551 self._screen_file.flush()
43ab0ca4
RG
552 except (UnicodeEncodeError), err:
553 if not ignore_encoding_errors:
554 raise
d3975459 555
7e5cab67
RG
556 def to_stderr(self, message):
557 """Print message to stderr."""
eae2666c 558 print >>sys.stderr, message.encode(preferredencoding())
d3975459 559
ccbd296b
MM
560 def to_cons_title(self, message):
561 """Set console/terminal window title to message."""
562 if not self.params.get('consoletitle', False):
563 return
564 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
565 # c_wchar_p() might not be necessary if `message` is
566 # already of type unicode()
567 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
568 elif 'TERM' in os.environ:
569 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
570
22899cea
RG
571 def fixed_template(self):
572 """Checks if the output template is fixed."""
d0a9affb 573 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
9fcd8355 574
0086d1ec
RG
575 def trouble(self, message=None):
576 """Determine action to take when a download problem appears.
577
578 Depending on if the downloader has been configured to ignore
e5bf0f55 579 download errors or not, this method may throw an exception or
9bf386d7 580 not when errors are found, after printing the message.
0086d1ec
RG
581 """
582 if message is not None:
583 self.to_stderr(message)
d0a9affb 584 if not self.params.get('ignoreerrors', False):
e5bf0f55 585 raise DownloadError(message)
9bf386d7 586 self._download_retcode = 1
0086d1ec 587
acd3d842
RG
588 def slow_down(self, start_time, byte_counter):
589 """Sleep if the download speed is over the rate limit."""
d0a9affb 590 rate_limit = self.params.get('ratelimit', None)
acd3d842
RG
591 if rate_limit is None or byte_counter == 0:
592 return
593 now = time.time()
594 elapsed = now - start_time
595 if elapsed <= 0.0:
596 return
597 speed = float(byte_counter) / elapsed
598 if speed > rate_limit:
599 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
3fb2c487
RG
600
601 def temp_name(self, filename):
602 """Returns a temporary filename for the given filename."""
603 if self.params.get('nopart', False) or filename == u'-' or \
604 (os.path.exists(filename) and not os.path.isfile(filename)):
605 return filename
606 return filename + u'.part'
607
8cc42e7c
RG
608 def undo_temp_name(self, filename):
609 if filename.endswith(u'.part'):
610 return filename[:-len(u'.part')]
611 return filename
612
62cf7aaf
RG
613 def try_rename(self, old_filename, new_filename):
614 try:
7d950ca1
RG
615 if old_filename == new_filename:
616 return
62cf7aaf
RG
617 os.rename(old_filename, new_filename)
618 except (IOError, OSError), err:
619 self.trouble(u'ERROR: unable to rename file')
c0a10ca8 620
e3018902
RG
621 def try_utime(self, filename, last_modified_hdr):
622 """Try to set the last-modified time of the given file."""
623 if last_modified_hdr is None:
624 return
625 if not os.path.isfile(filename):
626 return
627 timestr = last_modified_hdr
628 if timestr is None:
629 return
630 filetime = timeconvert(timestr)
631 if filetime is None:
36597dc4 632 return filetime
e3018902 633 try:
c0a10ca8 634 os.utime(filename, (time.time(), filetime))
e3018902
RG
635 except:
636 pass
36597dc4 637 return filetime
acd3d842 638
8b95c387 639 def report_writedescription(self, descfn):
6eb08fbf
PH
640 """ Report that the description file is being written """
641 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
642
643 def report_writeinfojson(self, infofn):
644 """ Report that the metadata file has been written """
645 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
8b95c387 646
bafa5cd9
RG
647 def report_destination(self, filename):
648 """Report destination filename."""
331ce0a0 649 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
d3975459 650
bafa5cd9
RG
651 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
652 """Report download progress."""
d9835247
RG
653 if self.params.get('noprogress', False):
654 return
331ce0a0 655 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
bafa5cd9 656 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
ccbd296b
MM
657 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
658 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
7db85b2c
RG
659
660 def report_resuming_byte(self, resume_len):
8a9f53be 661 """Report attempt to resume at given byte."""
331ce0a0 662 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
d3975459 663
7031008c 664 def report_retry(self, count, retries):
e86e9474 665 """Report retry in case of HTTP error 5xx"""
331ce0a0 666 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
d3975459 667
7db85b2c
RG
668 def report_file_already_downloaded(self, file_name):
669 """Report file has already been fully downloaded."""
43ab0ca4 670 try:
331ce0a0 671 self.to_screen(u'[download] %s has already been downloaded' % file_name)
43ab0ca4 672 except (UnicodeEncodeError), err:
331ce0a0 673 self.to_screen(u'[download] The file has already been downloaded')
d3975459 674
7db85b2c
RG
675 def report_unable_to_resume(self):
676 """Report it was impossible to resume download."""
331ce0a0 677 self.to_screen(u'[download] Unable to resume')
d3975459 678
bafa5cd9
RG
679 def report_finish(self):
680 """Report download finished."""
d9835247 681 if self.params.get('noprogress', False):
331ce0a0 682 self.to_screen(u'[download] Download completed')
d9835247 683 else:
331ce0a0 684 self.to_screen(u'')
d3975459 685
df372a65
RG
686 def increment_downloads(self):
687 """Increment the ordinal that assigns a number to each file."""
688 self._num_downloads += 1
bafa5cd9 689
9f796346
GI
690 def prepare_filename(self, info_dict):
691 """Generate the output filename."""
692 try:
693 template_dict = dict(info_dict)
694 template_dict['epoch'] = unicode(long(time.time()))
695 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
696 filename = self.params['outtmpl'] % template_dict
697 return filename
698 except (ValueError, KeyError), err:
699 self.trouble(u'ERROR: invalid system charset or erroneous output template')
700 return None
701
0cd235ee
PH
702 def _match_entry(self, info_dict):
703 """ Returns None iff the file should be downloaded """
704
705 title = info_dict['title']
706 matchtitle = self.params.get('matchtitle', False)
707 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
708 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
709 rejecttitle = self.params.get('rejecttitle', False)
710 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
711 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
712 return None
713
c8619e01
RG
714 def process_info(self, info_dict):
715 """Process a single dictionary returned by an InfoExtractor."""
b88a5250 716
0cd235ee
PH
717 reason = self._match_entry(info_dict)
718 if reason is not None:
719 self.to_screen(u'[download] ' + reason)
720 return
721
c379c181 722 max_downloads = self.params.get('max_downloads')
b88a5250 723 if max_downloads is not None:
c379c181 724 if self._num_downloads > int(max_downloads):
b88a5250
PH
725 self.to_screen(u'[download] Maximum number of downloads reached. Skipping ' + info_dict['title'])
726 return
0cd235ee 727
9f796346 728 filename = self.prepare_filename(info_dict)
9b4556c4
PH
729
730 # Forced printings
731 if self.params.get('forcetitle', False):
732 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
733 if self.params.get('forceurl', False):
734 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
735 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
736 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
737 if self.params.get('forcedescription', False) and 'description' in info_dict:
738 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
739 if self.params.get('forcefilename', False) and filename is not None:
740 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
da0db53a
DH
741 if self.params.get('forceformat', False):
742 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
9b4556c4 743
c8619e01
RG
744 # Do nothing else if in simulate mode
745 if self.params.get('simulate', False):
9bf386d7 746 return
d3975459 747
9f796346 748 if filename is None:
38ed1344 749 return
20e91e83 750
850ab765 751 if self.params.get('nooverwrites', False) and os.path.exists(filename):
5c44af18 752 self.to_stderr(u'WARNING: file exists and will be skipped')
9bf386d7 753 return
7b7759f5 754
c8619e01 755 try:
e5e74ffb
PH
756 dn = os.path.dirname(filename)
757 if dn != '' and not os.path.exists(dn):
758 os.makedirs(dn)
c8619e01 759 except (OSError, IOError), err:
cec3a53c 760 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
9bf386d7 761 return
7b7759f5 762
8b95c387
PH
763 if self.params.get('writedescription', False):
764 try:
765 descfn = filename + '.description'
6eb08fbf 766 self.report_writedescription(descfn)
1293ce58
PH
767 descfile = open(descfn, 'wb')
768 try:
8b95c387 769 descfile.write(info_dict['description'].encode('utf-8'))
1293ce58
PH
770 finally:
771 descfile.close()
8b95c387 772 except (OSError, IOError):
cec3a53c 773 self.trouble(u'ERROR: Cannot write description file ' + descfn)
8b95c387
PH
774 return
775
6eb08fbf
PH
776 if self.params.get('writeinfojson', False):
777 infofn = filename + '.info.json'
778 self.report_writeinfojson(infofn)
779 try:
780 json.dump
781 except (NameError,AttributeError):
782 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
783 return
784 try:
1293ce58
PH
785 infof = open(infofn, 'wb')
786 try:
54f329fe
PH
787 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
788 json.dump(json_info_dict, infof)
1293ce58
PH
789 finally:
790 infof.close()
6eb08fbf 791 except (OSError, IOError):
cec3a53c 792 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
6eb08fbf
PH
793 return
794
9b4556c4 795 if not self.params.get('skip_download', False):
55e7c75e 796 try:
366cbfb0 797 success = self._do_download(filename, info_dict)
9b4556c4
PH
798 except (OSError, IOError), err:
799 raise UnavailableVideoError
800 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
801 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
802 return
803 except (ContentTooShortError, ), err:
804 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
55e7c75e 805 return
9b4556c4
PH
806
807 if success:
808 try:
809 self.post_process(filename, info_dict)
810 except (PostProcessingError), err:
811 self.trouble(u'ERROR: postprocessing: %s' % str(err))
812 return
c8619e01 813
4fa74b52
RG
814 def download(self, url_list):
815 """Download a given list of URLs."""
22899cea 816 if len(url_list) > 1 and self.fixed_template():
d0a9affb 817 raise SameFileError(self.params['outtmpl'])
22899cea 818
4fa74b52
RG
819 for url in url_list:
820 suitable_found = False
821 for ie in self._ies:
c8619e01 822 # Go to next InfoExtractor if not suitable
4fa74b52
RG
823 if not ie.suitable(url):
824 continue
c8619e01 825
4fa74b52
RG
826 # Suitable InfoExtractor found
827 suitable_found = True
c8619e01 828
6f21f686
RG
829 # Extract information from URL and process it
830 ie.extract(url)
65cd34c5 831
c8619e01 832 # Suitable InfoExtractor had been found; go to next URL
4fa74b52 833 break
c8619e01 834
4fa74b52 835 if not suitable_found:
db7e31b8 836 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
bb681b88 837
9bf386d7 838 return self._download_retcode
65cd34c5
RG
839
840 def post_process(self, filename, ie_info):
841 """Run the postprocessing chain on the given file."""
842 info = dict(ie_info)
843 info['filepath'] = filename
844 for pp in self._pps:
845 info = pp.run(info)
846 if info is None:
847 break
d3975459 848
e616ec0c 849 def _download_with_rtmpdump(self, filename, url, player_url):
0487b407 850 self.report_destination(filename)
62cf7aaf 851 tmpfilename = self.temp_name(filename)
0487b407
RG
852
853 # Check for rtmpdump first
854 try:
855 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
856 except (OSError, IOError):
857 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
858 return False
859
860 # Download using rtmpdump. rtmpdump returns exit code 2 when
861 # the connection was interrumpted and resuming appears to be
862 # possible. This is part of rtmpdump's normal usage, AFAIK.
b487ef08 863 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
1c1821f8
RG
864 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
865 while retval == 2 or retval == 1:
62cf7aaf 866 prevsize = os.path.getsize(tmpfilename)
331ce0a0 867 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
e616ec0c 868 time.sleep(5.0) # This seems to be needed
1c1821f8 869 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
62cf7aaf 870 cursize = os.path.getsize(tmpfilename)
e616ec0c
RG
871 if prevsize == cursize and retval == 1:
872 break
b487ef08
PH
873 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
874 if prevsize == cursize and retval == 2 and cursize > 1024:
875 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
876 retval = 0
877 break
0487b407 878 if retval == 0:
62cf7aaf
RG
879 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
880 self.try_rename(tmpfilename, filename)
0487b407
RG
881 return True
882 else:
db7e31b8 883 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
0487b407
RG
884 return False
885
366cbfb0
PH
886 def _do_download(self, filename, info_dict):
887 url = info_dict['url']
888 player_url = info_dict.get('player_url', None)
889
62cf7aaf 890 # Check file already present
3fb2c487 891 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
62cf7aaf
RG
892 self.report_file_already_downloaded(filename)
893 return True
894
0487b407
RG
895 # Attempt to download using rtmpdump
896 if url.startswith('rtmp'):
e616ec0c 897 return self._download_with_rtmpdump(filename, url, player_url)
0487b407 898
62cf7aaf 899 tmpfilename = self.temp_name(filename)
55e7c75e 900 stream = None
1987c232
RG
901
902 # Do not include the Accept-Encoding header
903 headers = {'Youtubedl-no-compression': 'True'}
904 basic_request = urllib2.Request(url, None, headers)
905 request = urllib2.Request(url, None, headers)
7db85b2c 906
9c457d2a 907 # Establish possible resume length
62cf7aaf
RG
908 if os.path.isfile(tmpfilename):
909 resume_len = os.path.getsize(tmpfilename)
55e7c75e
RG
910 else:
911 resume_len = 0
9c457d2a 912
10e7194d
MH
913 open_mode = 'wb'
914 if resume_len != 0:
915 if self.params.get('continuedl', False):
916 self.report_resuming_byte(resume_len)
917 request.add_header('Range','bytes=%d-' % resume_len)
918 open_mode = 'ab'
919 else:
920 resume_len = 0
55e7c75e 921
7031008c
RG
922 count = 0
923 retries = self.params.get('retries', 0)
101e0d1e 924 while count <= retries:
7031008c
RG
925 # Establish connection
926 try:
54f329fe
PH
927 if count == 0 and 'urlhandle' in info_dict:
928 data = info_dict['urlhandle']
7031008c
RG
929 data = urllib2.urlopen(request)
930 break
931 except (urllib2.HTTPError, ), err:
ac249f42 932 if (err.code < 500 or err.code >= 600) and err.code != 416:
101e0d1e 933 # Unexpected HTTP error
7031008c 934 raise
101e0d1e
RG
935 elif err.code == 416:
936 # Unable to resume (requested range not satisfiable)
937 try:
938 # Open the connection again without the range header
939 data = urllib2.urlopen(basic_request)
940 content_length = data.info()['Content-Length']
941 except (urllib2.HTTPError, ), err:
ac249f42 942 if err.code < 500 or err.code >= 600:
101e0d1e
RG
943 raise
944 else:
945 # Examine the reported length
268fb2bd 946 if (content_length is not None and
c0a10ca8 947 (resume_len - 100 < long(content_length) < resume_len + 100)):
268fb2bd
RG
948 # The file had already been fully downloaded.
949 # Explanation to the above condition: in issue #175 it was revealed that
950 # YouTube sometimes adds or removes a few bytes from the end of the file,
951 # changing the file size slightly and causing problems for some users. So
952 # I decided to implement a suggested change and consider the file
953 # completely downloaded if the file size differs less than 100 bytes from
954 # the one in the hard drive.
101e0d1e 955 self.report_file_already_downloaded(filename)
62cf7aaf 956 self.try_rename(tmpfilename, filename)
101e0d1e
RG
957 return True
958 else:
959 # The length does not match, we start the download over
960 self.report_unable_to_resume()
961 open_mode = 'wb'
962 break
963 # Retry
964 count += 1
965 if count <= retries:
966 self.report_retry(count, retries)
967
968 if count > retries:
969 self.trouble(u'ERROR: giving up after %s retries' % retries)
970 return False
7db85b2c 971
4fa74b52 972 data_len = data.info().get('Content-length', None)
106d091e
RG
973 if data_len is not None:
974 data_len = long(data_len) + resume_len
4fa74b52 975 data_len_str = self.format_bytes(data_len)
106d091e 976 byte_counter = 0 + resume_len
4fa74b52
RG
977 block_size = 1024
978 start = time.time()
979 while True:
bafa5cd9 980 # Download and write
4fa74b52
RG
981 before = time.time()
982 data_block = data.read(block_size)
983 after = time.time()
975a91d0 984 if len(data_block) == 0:
4fa74b52 985 break
975a91d0 986 byte_counter += len(data_block)
55e7c75e
RG
987
988 # Open file just in time
989 if stream is None:
990 try:
62cf7aaf 991 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
dbddab27 992 assert stream is not None
8cc42e7c 993 filename = self.undo_temp_name(tmpfilename)
55e7c75e
RG
994 self.report_destination(filename)
995 except (OSError, IOError), err:
db7e31b8 996 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
55e7c75e 997 return False
131efd1a
RG
998 try:
999 stream.write(data_block)
1000 except (IOError, OSError), err:
d67e0974
RG
1001 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1002 return False
975a91d0 1003 block_size = self.best_block_size(after - before, len(data_block))
4fa74b52 1004
55e7c75e 1005 # Progress message
975a91d0 1006 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
67035ede
PH
1007 if data_len is None:
1008 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1009 else:
1010 percent_str = self.calc_percent(byte_counter, data_len)
1011 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1012 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
55e7c75e 1013
acd3d842 1014 # Apply rate limit
975a91d0 1015 self.slow_down(start, byte_counter - resume_len)
acd3d842 1016
dbddab27
PH
1017 if stream is None:
1018 self.trouble(u'\nERROR: Did not get any data blocks')
1019 return False
6f0ff3ba 1020 stream.close()
bafa5cd9 1021 self.report_finish()
b905e5f5 1022 if data_len is not None and byte_counter != data_len:
d69a1c91 1023 raise ContentTooShortError(byte_counter, long(data_len))
62cf7aaf 1024 self.try_rename(tmpfilename, filename)
e3018902 1025
09bd408c 1026 # Update file modification time
e3018902 1027 if self.params.get('updatetime', True):
366cbfb0 1028 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
e3018902 1029
366cbfb0 1030 return True
4fa74b52 1031
c0a10ca8 1032
4fa74b52
RG
1033class InfoExtractor(object):
1034 """Information Extractor class.
1035
1036 Information extractors are the classes that, given a URL, extract
1037 information from the video (or videos) the URL refers to. This
1038 information includes the real video URL, the video title and simplified
2851b2ca
RG
1039 title, author and others. The information is stored in a dictionary
1040 which is then passed to the FileDownloader. The FileDownloader
1041 processes this information possibly downloading the video to the file
1042 system, among other possible outcomes. The dictionaries must include
4fa74b52
RG
1043 the following fields:
1044
1045 id: Video identifier.
1046 url: Final video URL.
1047 uploader: Nickname of the video uploader.
1048 title: Literal title.
1049 stitle: Simplified title.
1050 ext: Video filename extension.
6ba562b0 1051 format: Video format.
e616ec0c 1052 player_url: SWF Player URL (may be None).
4fa74b52 1053
7e58d568
RG
1054 The following fields are optional. Their primary purpose is to allow
1055 youtube-dl to serve as the backend for a video search function, such
1056 as the one in youtube2mp3. They are only used when their respective
1057 forced printing functions are called:
1058
1059 thumbnail: Full URL to a video thumbnail image.
1060 description: One-line video description.
1061
4fa74b52 1062 Subclasses of this one should re-define the _real_initialize() and
bdb3f7a7
PH
1063 _real_extract() methods and define a _VALID_URL regexp.
1064 Probably, they should also be added to the list of extractors.
4fa74b52
RG
1065 """
1066
1067 _ready = False
1068 _downloader = None
1069
1070 def __init__(self, downloader=None):
1071 """Constructor. Receives an optional downloader."""
1072 self._ready = False
1073 self.set_downloader(downloader)
1074
bdb3f7a7 1075 def suitable(self, url):
4fa74b52 1076 """Receives a URL and returns True if suitable for this IE."""
bdb3f7a7 1077 return re.match(self._VALID_URL, url) is not None
4fa74b52
RG
1078
1079 def initialize(self):
1c5e2302 1080 """Initializes an instance (authentication, etc)."""
4fa74b52
RG
1081 if not self._ready:
1082 self._real_initialize()
1083 self._ready = True
1084
1085 def extract(self, url):
1086 """Extracts URL information and returns it in list of dicts."""
1087 self.initialize()
1088 return self._real_extract(url)
1089
1090 def set_downloader(self, downloader):
1091 """Sets the downloader for this IE."""
1092 self._downloader = downloader
d3975459 1093
4fa74b52
RG
1094 def _real_initialize(self):
1095 """Real initialization process. Redefine in subclasses."""
1096 pass
1097
1098 def _real_extract(self, url):
1099 """Real extraction process. Redefine in subclasses."""
1100 pass
1101
c0a10ca8 1102
4fa74b52
RG
1103class YoutubeIE(InfoExtractor):
1104 """Information extractor for youtube.com."""
1105
1cde6f1d 1106 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
9715661c 1107 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
7df4635f 1108 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
72ac78b8 1109 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
4fa74b52 1110 _NETRC_MACHINE = 'youtube'
497cd3e6 1111 # Listed in order of quality
767414a2 1112 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
7b7759f5 1113 _video_extensions = {
1114 '13': '3gp',
1115 '17': 'mp4',
1116 '18': 'mp4',
1117 '22': 'mp4',
d9bc015b 1118 '37': 'mp4',
9e9647d9 1119 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
0b59bf4a 1120 '43': 'webm',
7b417b38 1121 '44': 'webm',
0b59bf4a 1122 '45': 'webm',
7b7759f5 1123 }
3de2a1e6
FT
1124 _video_dimensions = {
1125 '5': '240x400',
1126 '6': '???',
1127 '13': '???',
1128 '17': '144x176',
1129 '18': '360x640',
1130 '22': '720x1280',
1131 '34': '360x640',
1132 '35': '480x854',
1133 '37': '1080x1920',
1134 '38': '3072x4096',
1135 '43': '360x640',
1136 '44': '480x854',
1137 '45': '720x1280',
1138 }
f3098c4d 1139 IE_NAME = u'youtube'
4fa74b52 1140
72ac78b8
RG
1141 def report_lang(self):
1142 """Report attempt to set language."""
331ce0a0 1143 self._downloader.to_screen(u'[youtube] Setting language')
72ac78b8 1144
bafa5cd9
RG
1145 def report_login(self):
1146 """Report attempt to log in."""
331ce0a0 1147 self._downloader.to_screen(u'[youtube] Logging in')
d3975459 1148
bafa5cd9
RG
1149 def report_age_confirmation(self):
1150 """Report attempt to confirm age."""
331ce0a0 1151 self._downloader.to_screen(u'[youtube] Confirming age')
d3975459 1152
e616ec0c
RG
1153 def report_video_webpage_download(self, video_id):
1154 """Report attempt to download video webpage."""
331ce0a0 1155 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
d3975459 1156
71b7300e
RG
1157 def report_video_info_webpage_download(self, video_id):
1158 """Report attempt to download video info webpage."""
331ce0a0 1159 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
d3975459 1160
bafa5cd9
RG
1161 def report_information_extraction(self, video_id):
1162 """Report attempt to extract video information."""
331ce0a0 1163 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
d3975459 1164
7b7759f5 1165 def report_unavailable_format(self, video_id, format):
1166 """Report extracted video URL."""
331ce0a0 1167 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
d3975459 1168
0487b407
RG
1169 def report_rtmp_download(self):
1170 """Indicate the download will use the RTMP protocol."""
331ce0a0 1171 self._downloader.to_screen(u'[youtube] RTMP download detected')
d3975459 1172
3de2a1e6
FT
1173 def _print_formats(self, formats):
1174 print 'Available formats:'
1175 for x in formats:
1176 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1177
4fa74b52
RG
1178 def _real_initialize(self):
1179 if self._downloader is None:
1180 return
1181
1182 username = None
1183 password = None
d0a9affb 1184 downloader_params = self._downloader.params
4fa74b52
RG
1185
1186 # Attempt to use provided username and password or .netrc data
1187 if downloader_params.get('username', None) is not None:
1188 username = downloader_params['username']
1189 password = downloader_params['password']
1190 elif downloader_params.get('usenetrc', False):
1191 try:
1192 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1193 if info is not None:
1194 username = info[0]
1195 password = info[2]
1196 else:
1197 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1198 except (IOError, netrc.NetrcParseError), err:
6f21f686 1199 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
4fa74b52
RG
1200 return
1201
72ac78b8 1202 # Set language
1987c232 1203 request = urllib2.Request(self._LANG_URL)
72ac78b8
RG
1204 try:
1205 self.report_lang()
1206 urllib2.urlopen(request).read()
1207 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
6f21f686 1208 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
72ac78b8
RG
1209 return
1210
cc109403
RG
1211 # No authentication to be performed
1212 if username is None:
1213 return
1214
4fa74b52 1215 # Log in
9fcd8355
RG
1216 login_form = {
1217 'current_form': 'loginForm',
4fa74b52
RG
1218 'next': '/',
1219 'action_login': 'Log In',
1220 'username': username,
9fcd8355
RG
1221 'password': password,
1222 }
1987c232 1223 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
4fa74b52 1224 try:
bafa5cd9 1225 self.report_login()
4fa74b52
RG
1226 login_results = urllib2.urlopen(request).read()
1227 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
6f21f686 1228 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
4fa74b52
RG
1229 return
1230 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
6f21f686 1231 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
4fa74b52 1232 return
d3975459 1233
4fa74b52 1234 # Confirm age
9fcd8355
RG
1235 age_form = {
1236 'next_url': '/',
1237 'action_confirm': 'Confirm',
1238 }
1987c232 1239 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
4fa74b52 1240 try:
bafa5cd9 1241 self.report_age_confirmation()
4fa74b52
RG
1242 age_results = urllib2.urlopen(request).read()
1243 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 1244 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
e5bf0f55 1245 return
4fa74b52
RG
1246
1247 def _real_extract(self, url):
1248 # Extract video id from URL
020f7150 1249 mobj = re.match(self._VALID_URL, url)
4fa74b52 1250 if mobj is None:
147753eb 1251 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
6f21f686 1252 return
4fa74b52
RG
1253 video_id = mobj.group(2)
1254
497cd3e6
RG
1255 # Get video webpage
1256 self.report_video_webpage_download(video_id)
8d89fbae 1257 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
497cd3e6
RG
1258 try:
1259 video_webpage = urllib2.urlopen(request).read()
1260 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1261 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1262 return
968aa884 1263
497cd3e6 1264 # Attempt to extract SWF player URL
b620a5f8 1265 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
497cd3e6 1266 if mobj is not None:
b620a5f8 1267 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
497cd3e6
RG
1268 else:
1269 player_url = None
1270
1271 # Get video info
1272 self.report_video_info_webpage_download(video_id)
1273 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1274 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
c0a10ca8 1275 % (video_id, el_type))
1987c232 1276 request = urllib2.Request(video_info_url)
e616ec0c 1277 try:
497cd3e6
RG
1278 video_info_webpage = urllib2.urlopen(request).read()
1279 video_info = parse_qs(video_info_webpage)
1280 if 'token' in video_info:
1281 break
e616ec0c 1282 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
497cd3e6 1283 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
e616ec0c 1284 return
f95f29fd
RG
1285 if 'token' not in video_info:
1286 if 'reason' in video_info:
8e686771 1287 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
f95f29fd
RG
1288 else:
1289 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1290 return
1291
1292 # Start extracting information
497cd3e6
RG
1293 self.report_information_extraction(video_id)
1294
1295 # uploader
1296 if 'author' not in video_info:
1297 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1298 return
1299 video_uploader = urllib.unquote_plus(video_info['author'][0])
e616ec0c 1300
497cd3e6
RG
1301 # title
1302 if 'title' not in video_info:
1303 self._downloader.trouble(u'ERROR: unable to extract video title')
1304 return
1305 video_title = urllib.unquote_plus(video_info['title'][0])
1306 video_title = video_title.decode('utf-8')
1307 video_title = sanitize_title(video_title)
1308
1309 # simplified title
e092418d 1310 simple_title = _simplify_title(video_title)
497cd3e6
RG
1311
1312 # thumbnail image
1313 if 'thumbnail_url' not in video_info:
1314 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1315 video_thumbnail = ''
1316 else: # don't panic if we can't find it
1317 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1318
b3a27b52
NA
1319 # upload date
1320 upload_date = u'NA'
3efa45c3 1321 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
b3a27b52 1322 if mobj is not None:
a1f03c7b 1323 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
87cbd213 1324 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
a1f03c7b
NA
1325 for expression in format_expressions:
1326 try:
1327 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1328 except:
1329 pass
b3a27b52 1330
497cd3e6 1331 # description
c6b55a8d
PH
1332 try:
1333 lxml.etree
1334 except NameError:
1335 video_description = u'No description available.'
8b95c387 1336 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
c6b55a8d
PH
1337 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1338 if mobj is not None:
1339 video_description = mobj.group(1).decode('utf-8')
1340 else:
1341 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1342 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1343 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
91e6a385 1344 # TODO use another parser
497cd3e6 1345
5ce7d172
RG
1346 # token
1347 video_token = urllib.unquote_plus(video_info['token'][0])
1348
497cd3e6 1349 # Decide which formats to download
f83ae781 1350 req_format = self._downloader.params.get('format', None)
2e3a32e4 1351
f137bef9
PH
1352 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1353 self.report_rtmp_download()
1354 video_url_list = [(None, video_info['conn'][0])]
f137bef9 1355 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
0ac22e4f 1356 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
8519c32d 1357 url_data = [parse_qs(uds) for uds in url_data_strs]
f137bef9 1358 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
8519c32d 1359 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
2b70537d 1360
497cd3e6
RG
1361 format_limit = self._downloader.params.get('format_limit', None)
1362 if format_limit is not None and format_limit in self._available_formats:
1363 format_list = self._available_formats[self._available_formats.index(format_limit):]
e616ec0c 1364 else:
497cd3e6
RG
1365 format_list = self._available_formats
1366 existing_formats = [x for x in format_list if x in url_map]
1367 if len(existing_formats) == 0:
1368 self._downloader.trouble(u'ERROR: no known formats available for video')
968aa884 1369 return
3de2a1e6
FT
1370 if self._downloader.params.get('listformats', None):
1371 self._print_formats(existing_formats)
2761012f 1372 return
5260e68f 1373 if req_format is None or req_format == 'best':
d157d259 1374 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
c52b01f3
K
1375 elif req_format == 'worst':
1376 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
5260e68f 1377 elif req_format in ('-1', 'all'):
d157d259 1378 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
497cd3e6 1379 else:
5260e68f
PH
1380 # Specific formats. We pick the first in a slash-delimeted sequence.
1381 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1382 req_formats = req_format.split('/')
1383 video_url_list = None
1384 for rf in req_formats:
1385 if rf in url_map:
1386 video_url_list = [(rf, url_map[rf])]
1387 break
1388 if video_url_list is None:
5c132793
RG
1389 self._downloader.trouble(u'ERROR: requested format not available')
1390 return
497cd3e6 1391 else:
f3dc18d8 1392 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
497cd3e6 1393 return
7b7759f5 1394
497cd3e6
RG
1395 for format_param, video_real_url in video_url_list:
1396 # At this point we have a new video
1397 self._downloader.increment_downloads()
1398
1399 # Extension
1400 video_extension = self._video_extensions.get(format_param, 'flv')
7e58d568 1401
968aa884 1402 try:
7b7759f5 1403 # Process video information
1404 self._downloader.process_info({
1405 'id': video_id.decode('utf-8'),
1406 'url': video_real_url.decode('utf-8'),
1407 'uploader': video_uploader.decode('utf-8'),
138b11f3 1408 'upload_date': upload_date,
7b7759f5 1409 'title': video_title,
1410 'stitle': simple_title,
1411 'ext': video_extension.decode('utf-8'),
6ba562b0 1412 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
7e58d568 1413 'thumbnail': video_thumbnail.decode('utf-8'),
c6b55a8d 1414 'description': video_description,
e616ec0c 1415 'player_url': player_url,
7b7759f5 1416 })
497cd3e6 1417 except UnavailableVideoError, err:
09cc744c 1418 self._downloader.trouble(u'\nERROR: unable to download video')
42bcd27d 1419
4fa74b52 1420
020f7150
RG
1421class MetacafeIE(InfoExtractor):
1422 """Information Extractor for metacafe.com."""
1423
1424 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
2546e767 1425 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
dbccb6cd 1426 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
020f7150 1427 _youtube_ie = None
f3098c4d 1428 IE_NAME = u'metacafe'
020f7150
RG
1429
1430 def __init__(self, youtube_ie, downloader=None):
1431 InfoExtractor.__init__(self, downloader)
1432 self._youtube_ie = youtube_ie
1433
020f7150
RG
1434 def report_disclaimer(self):
1435 """Report disclaimer retrieval."""
331ce0a0 1436 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
020f7150
RG
1437
1438 def report_age_confirmation(self):
1439 """Report attempt to confirm age."""
331ce0a0 1440 self._downloader.to_screen(u'[metacafe] Confirming age')
d3975459 1441
020f7150
RG
1442 def report_download_webpage(self, video_id):
1443 """Report webpage download."""
331ce0a0 1444 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
d3975459 1445
020f7150
RG
1446 def report_extraction(self, video_id):
1447 """Report information extraction."""
331ce0a0 1448 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
020f7150
RG
1449
1450 def _real_initialize(self):
1451 # Retrieve disclaimer
1987c232 1452 request = urllib2.Request(self._DISCLAIMER)
020f7150
RG
1453 try:
1454 self.report_disclaimer()
1455 disclaimer = urllib2.urlopen(request).read()
1456 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 1457 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
020f7150
RG
1458 return
1459
1460 # Confirm age
1461 disclaimer_form = {
2546e767 1462 'filters': '0',
020f7150
RG
1463 'submit': "Continue - I'm over 18",
1464 }
1987c232 1465 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
020f7150
RG
1466 try:
1467 self.report_age_confirmation()
1468 disclaimer = urllib2.urlopen(request).read()
1469 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 1470 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
020f7150 1471 return
d3975459 1472
020f7150
RG
1473 def _real_extract(self, url):
1474 # Extract id and simplified title from URL
1475 mobj = re.match(self._VALID_URL, url)
1476 if mobj is None:
147753eb 1477 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
6f21f686 1478 return
020f7150
RG
1479
1480 video_id = mobj.group(1)
1481
1482 # Check if video comes from YouTube
1483 mobj2 = re.match(r'^yt-(.*)$', video_id)
1484 if mobj2 is not None:
6f21f686
RG
1485 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1486 return
020f7150 1487
df372a65 1488 # At this point we have a new video
9bf7fa52 1489 self._downloader.increment_downloads()
df372a65 1490
020f7150 1491 simple_title = mobj.group(2).decode('utf-8')
020f7150
RG
1492
1493 # Retrieve video webpage to extract further information
1494 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1495 try:
1496 self.report_download_webpage(video_id)
1497 webpage = urllib2.urlopen(request).read()
1498 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 1499 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
6f21f686 1500 return
020f7150
RG
1501
1502 # Extract URL, uploader and title from webpage
1503 self.report_extraction(video_id)
18963a36 1504 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
c6c555cf
RG
1505 if mobj is not None:
1506 mediaURL = urllib.unquote(mobj.group(1))
6b57e8c5 1507 video_extension = mediaURL[-3:]
d3975459 1508
c6c555cf
RG
1509 # Extract gdaKey if available
1510 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1511 if mobj is None:
1512 video_url = mediaURL
1513 else:
1514 gdaKey = mobj.group(1)
1515 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
109626fc 1516 else:
c6c555cf
RG
1517 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1518 if mobj is None:
1519 self._downloader.trouble(u'ERROR: unable to extract media URL')
1520 return
1521 vardict = parse_qs(mobj.group(1))
1522 if 'mediaData' not in vardict:
1523 self._downloader.trouble(u'ERROR: unable to extract media URL')
1524 return
1525 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1526 if mobj is None:
1527 self._downloader.trouble(u'ERROR: unable to extract media URL')
1528 return
6b57e8c5
RG
1529 mediaURL = mobj.group(1).replace('\\/', '/')
1530 video_extension = mediaURL[-3:]
1531 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
020f7150 1532
2546e767 1533 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
020f7150 1534 if mobj is None:
147753eb 1535 self._downloader.trouble(u'ERROR: unable to extract title')
6f21f686 1536 return
020f7150 1537 video_title = mobj.group(1).decode('utf-8')
490fd7ae 1538 video_title = sanitize_title(video_title)
020f7150 1539
29f07568 1540 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
020f7150 1541 if mobj is None:
147753eb 1542 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
6f21f686 1543 return
dbccb6cd 1544 video_uploader = mobj.group(1)
020f7150 1545
42bcd27d 1546 try:
1547 # Process video information
1548 self._downloader.process_info({
1549 'id': video_id.decode('utf-8'),
1550 'url': video_url.decode('utf-8'),
1551 'uploader': video_uploader.decode('utf-8'),
138b11f3 1552 'upload_date': u'NA',
42bcd27d 1553 'title': video_title,
1554 'stitle': simple_title,
1555 'ext': video_extension.decode('utf-8'),
6ba562b0 1556 'format': u'NA',
e616ec0c 1557 'player_url': None,
42bcd27d 1558 })
73f4e7af 1559 except UnavailableVideoError:
09cc744c 1560 self._downloader.trouble(u'\nERROR: unable to download video')
020f7150 1561
25af2bce 1562
4135fa45
WB
1563class DailymotionIE(InfoExtractor):
1564 """Information Extractor for Dailymotion"""
1565
1566 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
f3098c4d 1567 IE_NAME = u'dailymotion'
4135fa45
WB
1568
1569 def __init__(self, downloader=None):
1570 InfoExtractor.__init__(self, downloader)
1571
4135fa45
WB
1572 def report_download_webpage(self, video_id):
1573 """Report webpage download."""
331ce0a0 1574 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
d3975459 1575
4135fa45
WB
1576 def report_extraction(self, video_id):
1577 """Report information extraction."""
331ce0a0 1578 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
4135fa45 1579
4135fa45
WB
1580 def _real_extract(self, url):
1581 # Extract id and simplified title from URL
1582 mobj = re.match(self._VALID_URL, url)
1583 if mobj is None:
1584 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1585 return
1586
df372a65 1587 # At this point we have a new video
9bf7fa52 1588 self._downloader.increment_downloads()
4135fa45
WB
1589 video_id = mobj.group(1)
1590
1591 simple_title = mobj.group(2).decode('utf-8')
1592 video_extension = 'flv'
1593
1594 # Retrieve video webpage to extract further information
1595 request = urllib2.Request(url)
62a29bbf 1596 request.add_header('Cookie', 'family_filter=off')
4135fa45
WB
1597 try:
1598 self.report_download_webpage(video_id)
1599 webpage = urllib2.urlopen(request).read()
1600 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1601 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1602 return
1603
1604 # Extract URL, uploader and title from webpage
1605 self.report_extraction(video_id)
62a29bbf 1606 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
4135fa45
WB
1607 if mobj is None:
1608 self._downloader.trouble(u'ERROR: unable to extract media URL')
1609 return
62a29bbf 1610 sequence = urllib.unquote(mobj.group(1))
1611 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1612 if mobj is None:
1613 self._downloader.trouble(u'ERROR: unable to extract media URL')
1614 return
1615 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
4135fa45
WB
1616
1617 # if needed add http://www.dailymotion.com/ if relative URL
1618
1619 video_url = mediaURL
1620
62a29bbf 1621 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
4135fa45
WB
1622 if mobj is None:
1623 self._downloader.trouble(u'ERROR: unable to extract title')
1624 return
1625 video_title = mobj.group(1).decode('utf-8')
1626 video_title = sanitize_title(video_title)
1627
62a29bbf 1628 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
4135fa45
WB
1629 if mobj is None:
1630 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1631 return
1632 video_uploader = mobj.group(1)
1633
1634 try:
1635 # Process video information
1636 self._downloader.process_info({
1637 'id': video_id.decode('utf-8'),
1638 'url': video_url.decode('utf-8'),
1639 'uploader': video_uploader.decode('utf-8'),
138b11f3 1640 'upload_date': u'NA',
4135fa45
WB
1641 'title': video_title,
1642 'stitle': simple_title,
1643 'ext': video_extension.decode('utf-8'),
1644 'format': u'NA',
1645 'player_url': None,
1646 })
73f4e7af 1647 except UnavailableVideoError:
09cc744c 1648 self._downloader.trouble(u'\nERROR: unable to download video')
4135fa45 1649
c0a10ca8 1650
49c0028a 1651class GoogleIE(InfoExtractor):
1652 """Information extractor for video.google.com."""
1653
490fd7ae 1654 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
f3098c4d 1655 IE_NAME = u'video.google'
49c0028a 1656
1657 def __init__(self, downloader=None):
1658 InfoExtractor.__init__(self, downloader)
1659
49c0028a 1660 def report_download_webpage(self, video_id):
1661 """Report webpage download."""
331ce0a0 1662 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
49c0028a 1663
1664 def report_extraction(self, video_id):
1665 """Report information extraction."""
331ce0a0 1666 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
49c0028a 1667
49c0028a 1668 def _real_extract(self, url):
1669 # Extract id from URL
1670 mobj = re.match(self._VALID_URL, url)
1671 if mobj is None:
1672 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1673 return
1674
df372a65 1675 # At this point we have a new video
9bf7fa52 1676 self._downloader.increment_downloads()
49c0028a 1677 video_id = mobj.group(1)
1678
1679 video_extension = 'mp4'
1680
1681 # Retrieve video webpage to extract further information
490fd7ae 1682 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
49c0028a 1683 try:
1684 self.report_download_webpage(video_id)
1685 webpage = urllib2.urlopen(request).read()
1686 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1687 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1688 return
1689
1690 # Extract URL, uploader, and title from webpage
1691 self.report_extraction(video_id)
490fd7ae
RG
1692 mobj = re.search(r"download_url:'([^']+)'", webpage)
1693 if mobj is None:
1694 video_extension = 'flv'
1695 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
49c0028a 1696 if mobj is None:
1697 self._downloader.trouble(u'ERROR: unable to extract media URL')
1698 return
1699 mediaURL = urllib.unquote(mobj.group(1))
1700 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1701 mediaURL = mediaURL.replace('\\x26', '\x26')
1702
1703 video_url = mediaURL
1704
1705 mobj = re.search(r'<title>(.*)</title>', webpage)
1706 if mobj is None:
1707 self._downloader.trouble(u'ERROR: unable to extract title')
1708 return
1709 video_title = mobj.group(1).decode('utf-8')
490fd7ae 1710 video_title = sanitize_title(video_title)
e092418d 1711 simple_title = _simplify_title(video_title)
49c0028a 1712
7e58d568
RG
1713 # Extract video description
1714 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1715 if mobj is None:
1716 self._downloader.trouble(u'ERROR: unable to extract video description')
1717 return
1718 video_description = mobj.group(1).decode('utf-8')
1719 if not video_description:
1720 video_description = 'No description available.'
1721
1722 # Extract video thumbnail
1723 if self._downloader.params.get('forcethumbnail', False):
1724 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1725 try:
1726 webpage = urllib2.urlopen(request).read()
1727 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1728 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1729 return
1730 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1731 if mobj is None:
1732 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1733 return
1734 video_thumbnail = mobj.group(1)
1735 else: # we need something to pass to process_info
1736 video_thumbnail = ''
1737
49c0028a 1738 try:
1739 # Process video information
1740 self._downloader.process_info({
1741 'id': video_id.decode('utf-8'),
1742 'url': video_url.decode('utf-8'),
6ba562b0 1743 'uploader': u'NA',
138b11f3 1744 'upload_date': u'NA',
490fd7ae 1745 'title': video_title,
31cbdaaf 1746 'stitle': simple_title,
49c0028a 1747 'ext': video_extension.decode('utf-8'),
6ba562b0 1748 'format': u'NA',
e616ec0c 1749 'player_url': None,
49c0028a 1750 })
73f4e7af 1751 except UnavailableVideoError:
09cc744c 1752 self._downloader.trouble(u'\nERROR: unable to download video')
49c0028a 1753
1754
1755class PhotobucketIE(InfoExtractor):
1756 """Information extractor for photobucket.com."""
1757
1758 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
f3098c4d 1759 IE_NAME = u'photobucket'
49c0028a 1760
1761 def __init__(self, downloader=None):
1762 InfoExtractor.__init__(self, downloader)
1763
49c0028a 1764 def report_download_webpage(self, video_id):
1765 """Report webpage download."""
331ce0a0 1766 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
49c0028a 1767
1768 def report_extraction(self, video_id):
1769 """Report information extraction."""
331ce0a0 1770 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
49c0028a 1771
49c0028a 1772 def _real_extract(self, url):
1773 # Extract id from URL
1774 mobj = re.match(self._VALID_URL, url)
1775 if mobj is None:
1776 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1777 return
1778
df372a65 1779 # At this point we have a new video
9bf7fa52 1780 self._downloader.increment_downloads()
49c0028a 1781 video_id = mobj.group(1)
1782
1783 video_extension = 'flv'
1784
1785 # Retrieve video webpage to extract further information
1786 request = urllib2.Request(url)
1787 try:
1788 self.report_download_webpage(video_id)
1789 webpage = urllib2.urlopen(request).read()
1790 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1791 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1792 return
1793
1794 # Extract URL, uploader, and title from webpage
1795 self.report_extraction(video_id)
1796 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1797 if mobj is None:
1798 self._downloader.trouble(u'ERROR: unable to extract media URL')
1799 return
1800 mediaURL = urllib.unquote(mobj.group(1))
1801
1802 video_url = mediaURL
1803
1804 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1805 if mobj is None:
1806 self._downloader.trouble(u'ERROR: unable to extract title')
1807 return
1808 video_title = mobj.group(1).decode('utf-8')
490fd7ae 1809 video_title = sanitize_title(video_title)
e092418d 1810 simple_title = _simplify_title(vide_title)
49c0028a 1811
1812 video_uploader = mobj.group(2).decode('utf-8')
1813
1814 try:
1815 # Process video information
1816 self._downloader.process_info({
1817 'id': video_id.decode('utf-8'),
1818 'url': video_url.decode('utf-8'),
490fd7ae 1819 'uploader': video_uploader,
138b11f3 1820 'upload_date': u'NA',
490fd7ae 1821 'title': video_title,
31cbdaaf 1822 'stitle': simple_title,
490fd7ae 1823 'ext': video_extension.decode('utf-8'),
6ba562b0 1824 'format': u'NA',
e616ec0c 1825 'player_url': None,
490fd7ae 1826 })
73f4e7af 1827 except UnavailableVideoError:
09cc744c 1828 self._downloader.trouble(u'\nERROR: unable to download video')
490fd7ae
RG
1829
1830
61945318
RG
1831class YahooIE(InfoExtractor):
1832 """Information extractor for video.yahoo.com."""
1833
1834 # _VALID_URL matches all Yahoo! Video URLs
1835 # _VPAGE_URL matches only the extractable '/watch/' URLs
1836 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1837 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
f3098c4d 1838 IE_NAME = u'video.yahoo'
61945318
RG
1839
1840 def __init__(self, downloader=None):
1841 InfoExtractor.__init__(self, downloader)
1842
61945318
RG
1843 def report_download_webpage(self, video_id):
1844 """Report webpage download."""
331ce0a0 1845 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
61945318
RG
1846
1847 def report_extraction(self, video_id):
1848 """Report information extraction."""
331ce0a0 1849 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
61945318 1850
df372a65 1851 def _real_extract(self, url, new_video=True):
61945318
RG
1852 # Extract ID from URL
1853 mobj = re.match(self._VALID_URL, url)
1854 if mobj is None:
1855 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1856 return
1857
df372a65 1858 # At this point we have a new video
9bf7fa52 1859 self._downloader.increment_downloads()
61945318
RG
1860 video_id = mobj.group(2)
1861 video_extension = 'flv'
1862
1863 # Rewrite valid but non-extractable URLs as
1864 # extractable English language /watch/ URLs
1865 if re.match(self._VPAGE_URL, url) is None:
1866 request = urllib2.Request(url)
1867 try:
1868 webpage = urllib2.urlopen(request).read()
1869 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1870 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1871 return
1872
1873 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1874 if mobj is None:
1875 self._downloader.trouble(u'ERROR: Unable to extract id field')
1876 return
1877 yahoo_id = mobj.group(1)
1878
1879 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1880 if mobj is None:
1881 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1882 return
1883 yahoo_vid = mobj.group(1)
1884
1885 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
df372a65 1886 return self._real_extract(url, new_video=False)
61945318
RG
1887
1888 # Retrieve video webpage to extract further information
1889 request = urllib2.Request(url)
1890 try:
1891 self.report_download_webpage(video_id)
1892 webpage = urllib2.urlopen(request).read()
1893 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1894 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1895 return
1896
1897 # Extract uploader and title from webpage
1898 self.report_extraction(video_id)
1899 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1900 if mobj is None:
1901 self._downloader.trouble(u'ERROR: unable to extract video title')
1902 return
1903 video_title = mobj.group(1).decode('utf-8')
e092418d 1904 simple_title = _simplify_title(video_title)
61945318
RG
1905
1906 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1907 if mobj is None:
1908 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1909 return
1910 video_uploader = mobj.group(1).decode('utf-8')
1911
7e58d568
RG
1912 # Extract video thumbnail
1913 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1914 if mobj is None:
1915 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1916 return
1917 video_thumbnail = mobj.group(1).decode('utf-8')
1918
1919 # Extract video description
1920 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1921 if mobj is None:
1922 self._downloader.trouble(u'ERROR: unable to extract video description')
1923 return
1924 video_description = mobj.group(1).decode('utf-8')
c0a10ca8
F
1925 if not video_description:
1926 video_description = 'No description available.'
7e58d568 1927
61945318
RG
1928 # Extract video height and width
1929 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1930 if mobj is None:
1931 self._downloader.trouble(u'ERROR: unable to extract video height')
1932 return
1933 yv_video_height = mobj.group(1)
1934
1935 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1936 if mobj is None:
1937 self._downloader.trouble(u'ERROR: unable to extract video width')
1938 return
1939 yv_video_width = mobj.group(1)
1940
1941 # Retrieve video playlist to extract media URL
1942 # I'm not completely sure what all these options are, but we
1943 # seem to need most of them, otherwise the server sends a 401.
1944 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1945 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1946 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
c0a10ca8
F
1947 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1948 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
61945318
RG
1949 try:
1950 self.report_download_webpage(video_id)
1951 webpage = urllib2.urlopen(request).read()
1952 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1953 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1954 return
1955
1956 # Extract media URL from playlist XML
1957 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1958 if mobj is None:
1959 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1960 return
1961 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1962 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1963
1964 try:
1965 # Process video information
1966 self._downloader.process_info({
1967 'id': video_id.decode('utf-8'),
1968 'url': video_url,
1969 'uploader': video_uploader,
138b11f3 1970 'upload_date': u'NA',
61945318
RG
1971 'title': video_title,
1972 'stitle': simple_title,
1973 'ext': video_extension.decode('utf-8'),
7e58d568
RG
1974 'thumbnail': video_thumbnail.decode('utf-8'),
1975 'description': video_description,
1976 'thumbnail': video_thumbnail,
e616ec0c 1977 'player_url': None,
61945318 1978 })
73f4e7af 1979 except UnavailableVideoError:
09cc744c 1980 self._downloader.trouble(u'\nERROR: unable to download video')
61945318
RG
1981
1982
92743d42
RB
1983class VimeoIE(InfoExtractor):
1984 """Information extractor for vimeo.com."""
1985
1986 # _VALID_URL matches Vimeo URLs
44c636df 1987 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
f3098c4d 1988 IE_NAME = u'vimeo'
92743d42
RB
1989
1990 def __init__(self, downloader=None):
1991 InfoExtractor.__init__(self, downloader)
1992
92743d42
RB
1993 def report_download_webpage(self, video_id):
1994 """Report webpage download."""
0ecedbdb 1995 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
92743d42
RB
1996
1997 def report_extraction(self, video_id):
1998 """Report information extraction."""
0ecedbdb 1999 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
92743d42 2000
92743d42
RB
2001 def _real_extract(self, url, new_video=True):
2002 # Extract ID from URL
2003 mobj = re.match(self._VALID_URL, url)
2004 if mobj is None:
2005 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2006 return
2007
2008 # At this point we have a new video
2009 self._downloader.increment_downloads()
2010 video_id = mobj.group(1)
92743d42
RB
2011
2012 # Retrieve video webpage to extract further information
2013 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2014 try:
2015 self.report_download_webpage(video_id)
2016 webpage = urllib2.urlopen(request).read()
2017 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2018 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2019 return
2020
f24c674b
RB
2021 # Now we begin extracting as much information as we can from what we
2022 # retrieved. First we extract the information common to all extractors,
2023 # and latter we extract those that are Vimeo specific.
92743d42 2024 self.report_extraction(video_id)
f24c674b
RB
2025
2026 # Extract title
c5a088d3 2027 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
92743d42
RB
2028 if mobj is None:
2029 self._downloader.trouble(u'ERROR: unable to extract video title')
2030 return
2031 video_title = mobj.group(1).decode('utf-8')
28e3614b 2032 simple_title = _simplify_title(video_title)
92743d42 2033
f24c674b 2034 # Extract uploader
c5a088d3 2035 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
92743d42
RB
2036 if mobj is None:
2037 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2038 return
2039 video_uploader = mobj.group(1).decode('utf-8')
2040
2041 # Extract video thumbnail
c5a088d3 2042 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
92743d42
RB
2043 if mobj is None:
2044 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2045 return
2046 video_thumbnail = mobj.group(1).decode('utf-8')
2047
2048 # # Extract video description
2049 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2050 # if mobj is None:
2051 # self._downloader.trouble(u'ERROR: unable to extract video description')
2052 # return
2053 # video_description = mobj.group(1).decode('utf-8')
2054 # if not video_description: video_description = 'No description available.'
2055 video_description = 'Foo.'
2056
f24c674b 2057 # Vimeo specific: extract request signature
c5a088d3 2058 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
92743d42
RB
2059 if mobj is None:
2060 self._downloader.trouble(u'ERROR: unable to extract request signature')
2061 return
2062 sig = mobj.group(1).decode('utf-8')
2063
c424df0d
RB
2064 # Vimeo specific: extract video quality information
2065 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2066 if mobj is None:
2067 self._downloader.trouble(u'ERROR: unable to extract video quality information')
2068 return
2069 quality = mobj.group(1).decode('utf-8')
2070
2071 if int(quality) == 1:
2072 quality = 'hd'
2073 else:
2074 quality = 'sd'
2075
f24c674b 2076 # Vimeo specific: Extract request signature expiration
c5a088d3 2077 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
92743d42
RB
2078 if mobj is None:
2079 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2080 return
2081 sig_exp = mobj.group(1).decode('utf-8')
2082
c424df0d 2083 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
92743d42
RB
2084
2085 try:
2086 # Process video information
2087 self._downloader.process_info({
2088 'id': video_id.decode('utf-8'),
2089 'url': video_url,
2090 'uploader': video_uploader,
2091 'upload_date': u'NA',
2092 'title': video_title,
2093 'stitle': simple_title,
2fc31a48 2094 'ext': u'mp4',
92743d42
RB
2095 'thumbnail': video_thumbnail.decode('utf-8'),
2096 'description': video_description,
2097 'thumbnail': video_thumbnail,
2098 'description': video_description,
2099 'player_url': None,
2100 })
2101 except UnavailableVideoError:
2102 self._downloader.trouble(u'ERROR: unable to download video')
2103
2104
490fd7ae
RG
2105class GenericIE(InfoExtractor):
2106 """Generic last-resort information extractor."""
2107
f3098c4d
PH
2108 _VALID_URL = r'.*'
2109 IE_NAME = u'generic'
bdb3f7a7 2110
490fd7ae
RG
2111 def __init__(self, downloader=None):
2112 InfoExtractor.__init__(self, downloader)
2113
490fd7ae
RG
2114 def report_download_webpage(self, video_id):
2115 """Report webpage download."""
331ce0a0
RG
2116 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2117 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
490fd7ae
RG
2118
2119 def report_extraction(self, video_id):
2120 """Report information extraction."""
331ce0a0 2121 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
490fd7ae 2122
490fd7ae 2123 def _real_extract(self, url):
df372a65 2124 # At this point we have a new video
9bf7fa52 2125 self._downloader.increment_downloads()
df372a65 2126
490fd7ae
RG
2127 video_id = url.split('/')[-1]
2128 request = urllib2.Request(url)
2129 try:
2130 self.report_download_webpage(video_id)
2131 webpage = urllib2.urlopen(request).read()
2132 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2133 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2134 return
2135 except ValueError, err:
2136 # since this is the last-resort InfoExtractor, if
2137 # this error is thrown, it'll be thrown here
2138 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2139 return
2140
a9806fd8 2141 self.report_extraction(video_id)
490fd7ae
RG
2142 # Start with something easy: JW Player in SWFObject
2143 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2144 if mobj is None:
2145 # Broaden the search a little bit
2146 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2147 if mobj is None:
2148 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2149 return
2150
2151 # It's possible that one of the regexes
2152 # matched, but returned an empty group:
2153 if mobj.group(1) is None:
2154 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2155 return
2156
2157 video_url = urllib.unquote(mobj.group(1))
c0a10ca8 2158 video_id = os.path.basename(video_url)
490fd7ae
RG
2159
2160 # here's a fun little line of code for you:
2161 video_extension = os.path.splitext(video_id)[1][1:]
c0a10ca8 2162 video_id = os.path.splitext(video_id)[0]
490fd7ae
RG
2163
2164 # it's tempting to parse this further, but you would
2165 # have to take into account all the variations like
2166 # Video Title - Site Name
2167 # Site Name | Video Title
2168 # Video Title - Tagline | Site Name
2169 # and so on and so forth; it's just not practical
2170 mobj = re.search(r'<title>(.*)</title>', webpage)
2171 if mobj is None:
2172 self._downloader.trouble(u'ERROR: unable to extract title')
2173 return
2174 video_title = mobj.group(1).decode('utf-8')
2175 video_title = sanitize_title(video_title)
e092418d 2176 simple_title = _simplify_title(video_title)
490fd7ae
RG
2177
2178 # video uploader is domain name
2179 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2180 if mobj is None:
2181 self._downloader.trouble(u'ERROR: unable to extract title')
2182 return
2183 video_uploader = mobj.group(1).decode('utf-8')
2184
2185 try:
2186 # Process video information
2187 self._downloader.process_info({
2188 'id': video_id.decode('utf-8'),
2189 'url': video_url.decode('utf-8'),
2190 'uploader': video_uploader,
138b11f3 2191 'upload_date': u'NA',
490fd7ae 2192 'title': video_title,
31cbdaaf 2193 'stitle': simple_title,
49c0028a 2194 'ext': video_extension.decode('utf-8'),
6ba562b0 2195 'format': u'NA',
e616ec0c 2196 'player_url': None,
49c0028a 2197 })
73f4e7af 2198 except UnavailableVideoError, err:
09cc744c 2199 self._downloader.trouble(u'\nERROR: unable to download video')
49c0028a 2200
2201
25af2bce
RG
2202class YoutubeSearchIE(InfoExtractor):
2203 """Information Extractor for YouTube search queries."""
bdb3f7a7 2204 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
25af2bce
RG
2205 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2206 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
304a4d85 2207 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
25af2bce 2208 _youtube_ie = None
fd9288c3 2209 _max_youtube_results = 1000
f3098c4d 2210 IE_NAME = u'youtube:search'
25af2bce 2211
f995f712 2212 def __init__(self, youtube_ie, downloader=None):
25af2bce
RG
2213 InfoExtractor.__init__(self, downloader)
2214 self._youtube_ie = youtube_ie
d3975459 2215
25af2bce
RG
2216 def report_download_page(self, query, pagenum):
2217 """Report attempt to download playlist page with given number."""
490fd7ae 2218 query = query.decode(preferredencoding())
331ce0a0 2219 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
25af2bce
RG
2220
2221 def _real_initialize(self):
2222 self._youtube_ie.initialize()
d3975459 2223
25af2bce 2224 def _real_extract(self, query):
bdb3f7a7 2225 mobj = re.match(self._VALID_URL, query)
25af2bce 2226 if mobj is None:
147753eb 2227 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
6f21f686 2228 return
25af2bce
RG
2229
2230 prefix, query = query.split(':')
2231 prefix = prefix[8:]
c0a10ca8 2232 query = query.encode('utf-8')
f995f712 2233 if prefix == '':
6f21f686
RG
2234 self._download_n_results(query, 1)
2235 return
f995f712 2236 elif prefix == 'all':
6f21f686
RG
2237 self._download_n_results(query, self._max_youtube_results)
2238 return
f995f712 2239 else:
25af2bce 2240 try:
e1f18b8a 2241 n = long(prefix)
25af2bce 2242 if n <= 0:
147753eb 2243 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
6f21f686 2244 return
257453b9 2245 elif n > self._max_youtube_results:
c0a10ca8 2246 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
257453b9 2247 n = self._max_youtube_results
6f21f686
RG
2248 self._download_n_results(query, n)
2249 return
e1f18b8a 2250 except ValueError: # parsing prefix as integer fails
6f21f686
RG
2251 self._download_n_results(query, 1)
2252 return
25af2bce
RG
2253
2254 def _download_n_results(self, query, n):
2255 """Downloads a specified number of results for a query"""
2256
2257 video_ids = []
2258 already_seen = set()
2259 pagenum = 1
2260
2261 while True:
2262 self.report_download_page(query, pagenum)
a9633f14 2263 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1987c232 2264 request = urllib2.Request(result_url)
25af2bce
RG
2265 try:
2266 page = urllib2.urlopen(request).read()
2267 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 2268 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
6f21f686 2269 return
25af2bce
RG
2270
2271 # Extract video identifiers
2272 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2273 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2274 if video_id not in already_seen:
2275 video_ids.append(video_id)
2276 already_seen.add(video_id)
2277 if len(video_ids) == n:
2278 # Specified n videos reached
25af2bce 2279 for id in video_ids:
6f21f686
RG
2280 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2281 return
25af2bce 2282
304a4d85 2283 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
25af2bce 2284 for id in video_ids:
6f21f686
RG
2285 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2286 return
25af2bce
RG
2287
2288 pagenum = pagenum + 1
2289
c0a10ca8 2290
7e58d568
RG
2291class GoogleSearchIE(InfoExtractor):
2292 """Information Extractor for Google Video search queries."""
bdb3f7a7 2293 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
7e58d568
RG
2294 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2295 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2296 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2297 _google_ie = None
2298 _max_google_results = 1000
f3098c4d 2299 IE_NAME = u'video.google:search'
7e58d568
RG
2300
2301 def __init__(self, google_ie, downloader=None):
2302 InfoExtractor.__init__(self, downloader)
2303 self._google_ie = google_ie
d3975459 2304
7e58d568
RG
2305 def report_download_page(self, query, pagenum):
2306 """Report attempt to download playlist page with given number."""
2307 query = query.decode(preferredencoding())
331ce0a0 2308 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
7e58d568
RG
2309
2310 def _real_initialize(self):
2311 self._google_ie.initialize()
d3975459 2312
7e58d568 2313 def _real_extract(self, query):
bdb3f7a7 2314 mobj = re.match(self._VALID_URL, query)
7e58d568
RG
2315 if mobj is None:
2316 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2317 return
2318
2319 prefix, query = query.split(':')
2320 prefix = prefix[8:]
c0a10ca8 2321 query = query.encode('utf-8')
7e58d568
RG
2322 if prefix == '':
2323 self._download_n_results(query, 1)
2324 return
2325 elif prefix == 'all':
2326 self._download_n_results(query, self._max_google_results)
2327 return
2328 else:
2329 try:
2330 n = long(prefix)
2331 if n <= 0:
2332 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2333 return
2334 elif n > self._max_google_results:
c0a10ca8 2335 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
7e58d568
RG
2336 n = self._max_google_results
2337 self._download_n_results(query, n)
2338 return
2339 except ValueError: # parsing prefix as integer fails
2340 self._download_n_results(query, 1)
2341 return
2342
2343 def _download_n_results(self, query, n):
2344 """Downloads a specified number of results for a query"""
2345
2346 video_ids = []
2347 already_seen = set()
2348 pagenum = 1
2349
2350 while True:
2351 self.report_download_page(query, pagenum)
2352 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1987c232 2353 request = urllib2.Request(result_url)
7e58d568
RG
2354 try:
2355 page = urllib2.urlopen(request).read()
2356 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2357 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2358 return
2359
2360 # Extract video identifiers
2361 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2362 video_id = mobj.group(1)
2363 if video_id not in already_seen:
2364 video_ids.append(video_id)
2365 already_seen.add(video_id)
2366 if len(video_ids) == n:
2367 # Specified n videos reached
2368 for id in video_ids:
2369 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2370 return
2371
2372 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2373 for id in video_ids:
2374 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2375 return
2376
2377 pagenum = pagenum + 1
2378
c0a10ca8 2379
7e58d568
RG
2380class YahooSearchIE(InfoExtractor):
2381 """Information Extractor for Yahoo! Video search queries."""
bdb3f7a7 2382 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
7e58d568
RG
2383 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2384 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2385 _MORE_PAGES_INDICATOR = r'\s*Next'
2386 _yahoo_ie = None
2387 _max_yahoo_results = 1000
f3098c4d 2388 IE_NAME = u'video.yahoo:search'
7e58d568
RG
2389
2390 def __init__(self, yahoo_ie, downloader=None):
2391 InfoExtractor.__init__(self, downloader)
2392 self._yahoo_ie = yahoo_ie
d3975459 2393
7e58d568
RG
2394 def report_download_page(self, query, pagenum):
2395 """Report attempt to download playlist page with given number."""
2396 query = query.decode(preferredencoding())
331ce0a0 2397 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
7e58d568
RG
2398
2399 def _real_initialize(self):
2400 self._yahoo_ie.initialize()
d3975459 2401
7e58d568 2402 def _real_extract(self, query):
bdb3f7a7 2403 mobj = re.match(self._VALID_URL, query)
7e58d568
RG
2404 if mobj is None:
2405 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2406 return
2407
2408 prefix, query = query.split(':')
2409 prefix = prefix[8:]
c0a10ca8 2410 query = query.encode('utf-8')
7e58d568
RG
2411 if prefix == '':
2412 self._download_n_results(query, 1)
2413 return
2414 elif prefix == 'all':
2415 self._download_n_results(query, self._max_yahoo_results)
2416 return
2417 else:
2418 try:
2419 n = long(prefix)
2420 if n <= 0:
2421 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2422 return
2423 elif n > self._max_yahoo_results:
c0a10ca8 2424 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
7e58d568
RG
2425 n = self._max_yahoo_results
2426 self._download_n_results(query, n)
2427 return
2428 except ValueError: # parsing prefix as integer fails
2429 self._download_n_results(query, 1)
2430 return
2431
2432 def _download_n_results(self, query, n):
2433 """Downloads a specified number of results for a query"""
2434
2435 video_ids = []
2436 already_seen = set()
2437 pagenum = 1
2438
2439 while True:
2440 self.report_download_page(query, pagenum)
2441 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1987c232 2442 request = urllib2.Request(result_url)
7e58d568
RG
2443 try:
2444 page = urllib2.urlopen(request).read()
2445 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2446 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2447 return
2448
2449 # Extract video identifiers
2450 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2451 video_id = mobj.group(1)
2452 if video_id not in already_seen:
2453 video_ids.append(video_id)
2454 already_seen.add(video_id)
2455 if len(video_ids) == n:
2456 # Specified n videos reached
2457 for id in video_ids:
2458 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2459 return
2460
2461 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2462 for id in video_ids:
2463 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2464 return
2465
2466 pagenum = pagenum + 1
2467
c0a10ca8 2468
0c2dc87d
RG
2469class YoutubePlaylistIE(InfoExtractor):
2470 """Information Extractor for YouTube playlists."""
2471
c3e4e7c1 2472 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
f74e22ae 2473 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
0c2dc87d 2474 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
ce5cafea 2475 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
0c2dc87d 2476 _youtube_ie = None
f3098c4d 2477 IE_NAME = u'youtube:playlist'
0c2dc87d
RG
2478
2479 def __init__(self, youtube_ie, downloader=None):
2480 InfoExtractor.__init__(self, downloader)
2481 self._youtube_ie = youtube_ie
d3975459 2482
0c2dc87d
RG
2483 def report_download_page(self, playlist_id, pagenum):
2484 """Report attempt to download playlist page with given number."""
331ce0a0 2485 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
0c2dc87d
RG
2486
2487 def _real_initialize(self):
2488 self._youtube_ie.initialize()
d3975459 2489
0c2dc87d
RG
2490 def _real_extract(self, url):
2491 # Extract playlist id
2492 mobj = re.match(self._VALID_URL, url)
2493 if mobj is None:
147753eb 2494 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
6f21f686 2495 return
0c2dc87d 2496
d119b54d
RG
2497 # Single video case
2498 if mobj.group(3) is not None:
2499 self._youtube_ie.extract(mobj.group(3))
2500 return
2501
0c2dc87d 2502 # Download playlist pages
f74e22ae
GI
2503 # prefix is 'p' as default for playlists but there are other types that need extra care
2504 playlist_prefix = mobj.group(1)
2505 if playlist_prefix == 'a':
2506 playlist_access = 'artist'
2507 else:
7cc3c6fd 2508 playlist_prefix = 'p'
f74e22ae
GI
2509 playlist_access = 'view_play_list'
2510 playlist_id = mobj.group(2)
0c2dc87d
RG
2511 video_ids = []
2512 pagenum = 1
2513
2514 while True:
2515 self.report_download_page(playlist_id, pagenum)
c3e4e7c1
PH
2516 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2517 request = urllib2.Request(url)
0c2dc87d
RG
2518 try:
2519 page = urllib2.urlopen(request).read()
2520 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 2521 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
6f21f686 2522 return
0c2dc87d
RG
2523
2524 # Extract video identifiers
27d98b6e 2525 ids_in_page = []
0c2dc87d 2526 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
27d98b6e
RG
2527 if mobj.group(1) not in ids_in_page:
2528 ids_in_page.append(mobj.group(1))
2529 video_ids.extend(ids_in_page)
0c2dc87d 2530
ce5cafea 2531 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
0c2dc87d
RG
2532 break
2533 pagenum = pagenum + 1
2534
8cc44341
RG
2535 playliststart = self._downloader.params.get('playliststart', 1) - 1
2536 playlistend = self._downloader.params.get('playlistend', -1)
2537 video_ids = video_ids[playliststart:playlistend]
2538
0c2dc87d 2539 for id in video_ids:
6f21f686
RG
2540 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2541 return
0c2dc87d 2542
c0a10ca8 2543
c39c05cd
A
2544class YoutubeUserIE(InfoExtractor):
2545 """Information Extractor for YouTube users."""
2546
b845d58b 2547 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
c39c05cd 2548 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
5aba6ea4
RG
2549 _GDATA_PAGE_SIZE = 50
2550 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
abeac45a 2551 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
c39c05cd 2552 _youtube_ie = None
f3098c4d 2553 IE_NAME = u'youtube:user'
c39c05cd
A
2554
2555 def __init__(self, youtube_ie, downloader=None):
2556 InfoExtractor.__init__(self, downloader)
2557 self._youtube_ie = youtube_ie
d3975459 2558
5aba6ea4 2559 def report_download_page(self, username, start_index):
c39c05cd 2560 """Report attempt to download user page."""
5aba6ea4 2561 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
c0a10ca8 2562 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
c39c05cd
A
2563
2564 def _real_initialize(self):
2565 self._youtube_ie.initialize()
d3975459 2566
c39c05cd
A
2567 def _real_extract(self, url):
2568 # Extract username
2569 mobj = re.match(self._VALID_URL, url)
2570 if mobj is None:
2571 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2572 return
2573
c39c05cd 2574 username = mobj.group(1)
5aba6ea4
RG
2575
2576 # Download video ids using YouTube Data API. Result size per
2577 # query is limited (currently to 50 videos) so we need to query
2578 # page by page until there are no video ids - it means we got
2579 # all of them.
2580
c39c05cd 2581 video_ids = []
5aba6ea4 2582 pagenum = 0
c39c05cd 2583
5aba6ea4
RG
2584 while True:
2585 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2586 self.report_download_page(username, start_index)
c39c05cd 2587
5aba6ea4 2588 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
c39c05cd 2589
5aba6ea4
RG
2590 try:
2591 page = urllib2.urlopen(request).read()
2592 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2593 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2594 return
c39c05cd 2595
5aba6ea4
RG
2596 # Extract video identifiers
2597 ids_in_page = []
2598
2599 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2600 if mobj.group(1) not in ids_in_page:
2601 ids_in_page.append(mobj.group(1))
2602
2603 video_ids.extend(ids_in_page)
2604
2605 # A little optimization - if current page is not
2606 # "full", ie. does not contain PAGE_SIZE video ids then
2607 # we can assume that this page is the last one - there
2608 # are no more ids on further pages - no need to query
2609 # again.
2610
2611 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2612 break
2613
2614 pagenum += 1
2615
2616 all_ids_count = len(video_ids)
8cc44341
RG
2617 playliststart = self._downloader.params.get('playliststart', 1) - 1
2618 playlistend = self._downloader.params.get('playlistend', -1)
204c9398 2619
5aba6ea4
RG
2620 if playlistend == -1:
2621 video_ids = video_ids[playliststart:]
2622 else:
2623 video_ids = video_ids[playliststart:playlistend]
7a9054ec 2624
5aba6ea4 2625 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
c0a10ca8 2626 (username, all_ids_count, len(video_ids)))
5aba6ea4
RG
2627
2628 for video_id in video_ids:
2629 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2630
c39c05cd 2631
27179cfd
VV
2632class DepositFilesIE(InfoExtractor):
2633 """Information extractor for depositfiles.com"""
2634
b845d58b 2635 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
f3098c4d 2636 IE_NAME = u'DepositFiles'
27179cfd
VV
2637
2638 def __init__(self, downloader=None):
2639 InfoExtractor.__init__(self, downloader)
2640
27179cfd
VV
2641 def report_download_webpage(self, file_id):
2642 """Report webpage download."""
2643 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2644
2645 def report_extraction(self, file_id):
2646 """Report information extraction."""
2647 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2648
27179cfd
VV
2649 def _real_extract(self, url):
2650 # At this point we have a new file
2651 self._downloader.increment_downloads()
2652
2653 file_id = url.split('/')[-1]
2654 # Rebuild url in english locale
2655 url = 'http://depositfiles.com/en/files/' + file_id
2656
2657 # Retrieve file webpage with 'Free download' button pressed
2658 free_download_indication = { 'gateway_result' : '1' }
1987c232 2659 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
27179cfd
VV
2660 try:
2661 self.report_download_webpage(file_id)
2662 webpage = urllib2.urlopen(request).read()
2663 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2664 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2665 return
2666
2667 # Search for the real file URL
2668 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2669 if (mobj is None) or (mobj.group(1) is None):
2670 # Try to figure out reason of the error.
2671 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2672 if (mobj is not None) and (mobj.group(1) is not None):
2673 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2674 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2675 else:
2676 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2677 return
2678
2679 file_url = mobj.group(1)
2680 file_extension = os.path.splitext(file_url)[1][1:]
2681
2682 # Search for file title
2683 mobj = re.search(r'<b title="(.*?)">', webpage)
2684 if mobj is None:
2685 self._downloader.trouble(u'ERROR: unable to extract title')
2686 return
2687 file_title = mobj.group(1).decode('utf-8')
2688
2689 try:
2690 # Process file information
2691 self._downloader.process_info({
2692 'id': file_id.decode('utf-8'),
2693 'url': file_url.decode('utf-8'),
2694 'uploader': u'NA',
2695 'upload_date': u'NA',
2696 'title': file_title,
2697 'stitle': file_title,
2698 'ext': file_extension.decode('utf-8'),
2699 'format': u'NA',
2700 'player_url': None,
2701 })
2702 except UnavailableVideoError, err:
2703 self._downloader.trouble(u'ERROR: unable to download file')
2704
c0a10ca8 2705
9f5f9602
GI
2706class FacebookIE(InfoExtractor):
2707 """Information Extractor for Facebook"""
2708
857e5f32 2709 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
9f5f9602
GI
2710 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2711 _NETRC_MACHINE = 'facebook'
0067bbe7 2712 _available_formats = ['video', 'highqual', 'lowqual']
9f5f9602 2713 _video_extensions = {
0067bbe7 2714 'video': 'mp4',
9f5f9602
GI
2715 'highqual': 'mp4',
2716 'lowqual': 'mp4',
2717 }
f3098c4d 2718 IE_NAME = u'facebook'
9f5f9602
GI
2719
2720 def __init__(self, downloader=None):
2721 InfoExtractor.__init__(self, downloader)
2722
9f5f9602
GI
2723 def _reporter(self, message):
2724 """Add header and report message."""
2725 self._downloader.to_screen(u'[facebook] %s' % message)
2726
2727 def report_login(self):
2728 """Report attempt to log in."""
2729 self._reporter(u'Logging in')
2730
2731 def report_video_webpage_download(self, video_id):
2732 """Report attempt to download video webpage."""
2733 self._reporter(u'%s: Downloading video webpage' % video_id)
2734
2735 def report_information_extraction(self, video_id):
2736 """Report attempt to extract video information."""
2737 self._reporter(u'%s: Extracting video information' % video_id)
2738
2739 def _parse_page(self, video_webpage):
2740 """Extract video information from page"""
2741 # General data
99e207ba 2742 data = {'title': r'\("video_title", "(.*?)"\)',
9f5f9602
GI
2743 'description': r'<div class="datawrap">(.*?)</div>',
2744 'owner': r'\("video_owner_name", "(.*?)"\)',
9f5f9602
GI
2745 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2746 }
2747 video_info = {}
2748 for piece in data.keys():
2749 mobj = re.search(data[piece], video_webpage)
2750 if mobj is not None:
2751 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2752
2753 # Video urls
2754 video_urls = {}
2755 for fmt in self._available_formats:
2756 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2757 if mobj is not None:
2758 # URL is in a Javascript segment inside an escaped Unicode format within
2759 # the generally utf-8 page
2760 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2761 video_info['video_urls'] = video_urls
2762
2763 return video_info
2764
2765 def _real_initialize(self):
2766 if self._downloader is None:
2767 return
2768
2769 useremail = None
2770 password = None
2771 downloader_params = self._downloader.params
2772
2773 # Attempt to use provided username and password or .netrc data
2774 if downloader_params.get('username', None) is not None:
2775 useremail = downloader_params['username']
2776 password = downloader_params['password']
2777 elif downloader_params.get('usenetrc', False):
2778 try:
2779 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2780 if info is not None:
2781 useremail = info[0]
2782 password = info[2]
2783 else:
2784 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2785 except (IOError, netrc.NetrcParseError), err:
2786 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2787 return
2788
2789 if useremail is None:
2790 return
2791
2792 # Log in
2793 login_form = {
2794 'email': useremail,
2795 'pass': password,
2796 'login': 'Log+In'
2797 }
2798 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2799 try:
2800 self.report_login()
2801 login_results = urllib2.urlopen(request).read()
2802 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2803 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2804 return
2805 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2806 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2807 return
2808
2809 def _real_extract(self, url):
2810 mobj = re.match(self._VALID_URL, url)
2811 if mobj is None:
2812 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2813 return
2814 video_id = mobj.group('ID')
2815
2816 # Get video webpage
2817 self.report_video_webpage_download(video_id)
2818 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2819 try:
2820 page = urllib2.urlopen(request)
2821 video_webpage = page.read()
2822 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2823 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2824 return
2825
2826 # Start extracting information
2827 self.report_information_extraction(video_id)
2828
2829 # Extract information
2830 video_info = self._parse_page(video_webpage)
2831
2832 # uploader
2833 if 'owner' not in video_info:
2834 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2835 return
2836 video_uploader = video_info['owner']
2837
2838 # title
2839 if 'title' not in video_info:
2840 self._downloader.trouble(u'ERROR: unable to extract video title')
2841 return
2842 video_title = video_info['title']
2843 video_title = video_title.decode('utf-8')
2844 video_title = sanitize_title(video_title)
2845
e092418d 2846 simple_title = _simplify_title(video_title)
9f5f9602
GI
2847
2848 # thumbnail image
2849 if 'thumbnail' not in video_info:
2850 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2851 video_thumbnail = ''
2852 else:
2853 video_thumbnail = video_info['thumbnail']
2854
2855 # upload date
2856 upload_date = u'NA'
2857 if 'upload_date' in video_info:
2858 upload_time = video_info['upload_date']
2859 timetuple = email.utils.parsedate_tz(upload_time)
2860 if timetuple is not None:
2861 try:
2862 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2863 except:
2864 pass
2865
2866 # description
8b95c387 2867 video_description = video_info.get('description', 'No description available.')
9f5f9602
GI
2868
2869 url_map = video_info['video_urls']
2870 if len(url_map.keys()) > 0:
2871 # Decide which formats to download
2872 req_format = self._downloader.params.get('format', None)
2873 format_limit = self._downloader.params.get('format_limit', None)
2874
2875 if format_limit is not None and format_limit in self._available_formats:
2876 format_list = self._available_formats[self._available_formats.index(format_limit):]
2877 else:
2878 format_list = self._available_formats
2879 existing_formats = [x for x in format_list if x in url_map]
2880 if len(existing_formats) == 0:
2881 self._downloader.trouble(u'ERROR: no known formats available for video')
2882 return
2883 if req_format is None:
2884 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
c52b01f3
K
2885 elif req_format == 'worst':
2886 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
9f5f9602
GI
2887 elif req_format == '-1':
2888 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2889 else:
2890 # Specific format
2891 if req_format not in url_map:
2892 self._downloader.trouble(u'ERROR: requested format not available')
2893 return
2894 video_url_list = [(req_format, url_map[req_format])] # Specific format
2895
2896 for format_param, video_real_url in video_url_list:
2897
2898 # At this point we have a new video
2899 self._downloader.increment_downloads()
2900
2901 # Extension
2902 video_extension = self._video_extensions.get(format_param, 'mp4')
2903
9f5f9602
GI
2904 try:
2905 # Process video information
2906 self._downloader.process_info({
2907 'id': video_id.decode('utf-8'),
2908 'url': video_real_url.decode('utf-8'),
2909 'uploader': video_uploader.decode('utf-8'),
2910 'upload_date': upload_date,
2911 'title': video_title,
2912 'stitle': simple_title,
2913 'ext': video_extension.decode('utf-8'),
2914 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2915 'thumbnail': video_thumbnail.decode('utf-8'),
2916 'description': video_description.decode('utf-8'),
2917 'player_url': None,
2918 })
2919 except UnavailableVideoError, err:
2920 self._downloader.trouble(u'\nERROR: unable to download video')
2921
7745f5d8
PH
2922class BlipTVIE(InfoExtractor):
2923 """Information extractor for blip.tv"""
2924
1cab2c6d 2925 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
7745f5d8 2926 _URL_EXT = r'^.*\.([a-z0-9]+)$'
f3098c4d 2927 IE_NAME = u'blip.tv'
7745f5d8 2928
7745f5d8
PH
2929 def report_extraction(self, file_id):
2930 """Report information extraction."""
54f329fe
PH
2931 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2932
2933 def report_direct_download(self, title):
2934 """Report information extraction."""
2935 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
7745f5d8 2936
7745f5d8
PH
2937 def _real_extract(self, url):
2938 mobj = re.match(self._VALID_URL, url)
2939 if mobj is None:
2940 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2941 return
2942
1293ce58
PH
2943 if '?' in url:
2944 cchar = '&'
2945 else:
2946 cchar = '?'
2947 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
7745f5d8 2948 request = urllib2.Request(json_url)
aded78d9 2949 self.report_extraction(mobj.group(1))
54f329fe 2950 info = None
7745f5d8 2951 try:
54f329fe
PH
2952 urlh = urllib2.urlopen(request)
2953 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2954 basename = url.split('/')[-1]
2955 title,ext = os.path.splitext(basename)
468c9925 2956 title = title.decode('UTF-8')
54f329fe
PH
2957 ext = ext.replace('.', '')
2958 self.report_direct_download(title)
2959 info = {
2960 'id': title,
2961 'url': url,
2962 'title': title,
e092418d 2963 'stitle': _simplify_title(title),
54f329fe
PH
2964 'ext': ext,
2965 'urlhandle': urlh
2966 }
7745f5d8
PH
2967 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2968 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2969 return
54f329fe
PH
2970 if info is None: # Regular URL
2971 try:
2972 json_code = urlh.read()
2973 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2974 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2975 return
7745f5d8 2976
54f329fe
PH
2977 try:
2978 json_data = json.loads(json_code)
2979 if 'Post' in json_data:
2980 data = json_data['Post']
2981 else:
2982 data = json_data
2983
2984 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2985 video_url = data['media']['url']
2986 umobj = re.match(self._URL_EXT, video_url)
2987 if umobj is None:
2988 raise ValueError('Can not determine filename extension')
2989 ext = umobj.group(1)
2990
2991 info = {
2992 'id': data['item_id'],
2993 'url': video_url,
2994 'uploader': data['display_name'],
2995 'upload_date': upload_date,
2996 'title': data['title'],
e092418d 2997 'stitle': _simplify_title(data['title']),
54f329fe
PH
2998 'ext': ext,
2999 'format': data['media']['mimeType'],
3000 'thumbnail': data['thumbnailUrl'],
3001 'description': data['description'],
3002 'player_url': data['embedUrl']
3003 }
3004 except (ValueError,KeyError), err:
3005 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3006 return
a1cab7ce 3007
54f329fe 3008 self._downloader.increment_downloads()
7745f5d8
PH
3009
3010 try:
3011 self._downloader.process_info(info)
3012 except UnavailableVideoError, err:
3013 self._downloader.trouble(u'\nERROR: unable to download video')
3014
3015
9b0a8bc1
PH
3016class MyVideoIE(InfoExtractor):
3017 """Information Extractor for myvideo.de."""
3018
3019 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
f3098c4d 3020 IE_NAME = u'myvideo'
9b0a8bc1
PH
3021
3022 def __init__(self, downloader=None):
3023 InfoExtractor.__init__(self, downloader)
3024
9b0a8bc1
PH
3025 def report_download_webpage(self, video_id):
3026 """Report webpage download."""
3027 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3028
3029 def report_extraction(self, video_id):
3030 """Report information extraction."""
3031 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3032
9b0a8bc1
PH
3033 def _real_extract(self,url):
3034 mobj = re.match(self._VALID_URL, url)
3035 if mobj is None:
3036 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3037 return
3038
3039 video_id = mobj.group(1)
9b0a8bc1
PH
3040
3041 # Get video webpage
3042 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3043 try:
3044 self.report_download_webpage(video_id)
3045 webpage = urllib2.urlopen(request).read()
3046 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3047 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3048 return
3049
3050 self.report_extraction(video_id)
3051 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3052 webpage)
3053 if mobj is None:
3054 self._downloader.trouble(u'ERROR: unable to extract media URL')
3055 return
3056 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3057
3058 mobj = re.search('<title>([^<]+)</title>', webpage)
3059 if mobj is None:
3060 self._downloader.trouble(u'ERROR: unable to extract title')
3061 return
3062
3063 video_title = mobj.group(1)
3064 video_title = sanitize_title(video_title)
3065
e092418d
PH
3066 simple_title = _simplify_title(video_title)
3067
9b0a8bc1 3068 try:
9b0a8bc1
PH
3069 self._downloader.process_info({
3070 'id': video_id,
3071 'url': video_url,
3072 'uploader': u'NA',
3073 'upload_date': u'NA',
3074 'title': video_title,
3075 'stitle': simple_title,
3076 'ext': u'flv',
3077 'format': u'NA',
3078 'player_url': None,
3079 })
3080 except UnavailableVideoError:
3081 self._downloader.trouble(u'\nERROR: Unable to download video')
3082
c8e30044 3083class ComedyCentralIE(InfoExtractor):
f166bccc 3084 """Information extractor for The Daily Show and Colbert Report """
c8e30044 3085
f3098c4d
PH
3086 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3087 IE_NAME = u'comedycentral'
c8e30044 3088
c8e30044
PH
3089 def report_extraction(self, episode_id):
3090 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3091
3092 def report_config_download(self, episode_id):
3093 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3094
b487ef08
PH
3095 def report_index_download(self, episode_id):
3096 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3097
fedf9f39
PH
3098 def report_player_url(self, episode_id):
3099 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3100
c8e30044
PH
3101 def _real_extract(self, url):
3102 mobj = re.match(self._VALID_URL, url)
3103 if mobj is None:
3104 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3105 return
f166bccc
PH
3106
3107 if mobj.group('shortname'):
3108 if mobj.group('shortname') in ('tds', 'thedailyshow'):
468c9925 3109 url = u'http://www.thedailyshow.com/full-episodes/'
f166bccc 3110 else:
468c9925 3111 url = u'http://www.colbertnation.com/full-episodes/'
f166bccc
PH
3112 mobj = re.match(self._VALID_URL, url)
3113 assert mobj is not None
3114
3115 dlNewest = not mobj.group('episode')
3116 if dlNewest:
3117 epTitle = mobj.group('showname')
3118 else:
3119 epTitle = mobj.group('episode')
c8e30044
PH
3120
3121 req = urllib2.Request(url)
3122 self.report_extraction(epTitle)
3123 try:
f166bccc
PH
3124 htmlHandle = urllib2.urlopen(req)
3125 html = htmlHandle.read()
c8e30044
PH
3126 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3127 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3128 return
f166bccc
PH
3129 if dlNewest:
3130 url = htmlHandle.geturl()
3131 mobj = re.match(self._VALID_URL, url)
3132 if mobj is None:
3133 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3134 return
3135 if mobj.group('episode') == '':
3136 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3137 return
3138 epTitle = mobj.group('episode')
c8e30044 3139
b487ef08 3140 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
c8e30044
PH
3141 if len(mMovieParams) == 0:
3142 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3143 return
b487ef08
PH
3144
3145 playerUrl_raw = mMovieParams[0][0]
fedf9f39
PH
3146 self.report_player_url(epTitle)
3147 try:
b487ef08
PH
3148 urlHandle = urllib2.urlopen(playerUrl_raw)
3149 playerUrl = urlHandle.geturl()
3150 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3151 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3152 return
3153
3154 uri = mMovieParams[0][1]
3155 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3156 self.report_index_download(epTitle)
3157 try:
3158 indexXml = urllib2.urlopen(indexUrl).read()
fedf9f39 3159 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
b487ef08 3160 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
fedf9f39 3161 return
fedf9f39 3162
b487ef08
PH
3163 idoc = xml.etree.ElementTree.fromstring(indexXml)
3164 itemEls = idoc.findall('.//item')
3165 for itemEl in itemEls:
3166 mediaId = itemEl.findall('./guid')[0].text
3167 shortMediaId = mediaId.split(':')[-1]
3168 showId = mediaId.split(':')[-2].replace('.com', '')
3169 officialTitle = itemEl.findall('./title')[0].text
3170 officialDate = itemEl.findall('./pubDate')[0].text
3171
c8e30044
PH
3172 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3173 urllib.urlencode({'uri': mediaId}))
3174 configReq = urllib2.Request(configUrl)
3175 self.report_config_download(epTitle)
3176 try:
3177 configXml = urllib2.urlopen(configReq).read()
3178 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3179 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3180 return
46c8c432 3181
c8e30044
PH
3182 cdoc = xml.etree.ElementTree.fromstring(configXml)
3183 turls = []
3184 for rendition in cdoc.findall('.//rendition'):
3185 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3186 turls.append(finfo)
3187
a88bc6bb 3188 if len(turls) == 0:
b487ef08 3189 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
a88bc6bb
PH
3190 continue
3191
c8e30044
PH
3192 # For now, just pick the highest bitrate
3193 format,video_url = turls[-1]
3194
3195 self._downloader.increment_downloads()
a88bc6bb 3196
468c9925 3197 effTitle = showId + u'-' + epTitle
c8e30044 3198 info = {
b487ef08 3199 'id': shortMediaId,
c8e30044 3200 'url': video_url,
b487ef08
PH
3201 'uploader': showId,
3202 'upload_date': officialDate,
a88bc6bb 3203 'title': effTitle,
208e095f 3204 'stitle': _simplify_title(effTitle),
c8e30044
PH
3205 'ext': 'mp4',
3206 'format': format,
3207 'thumbnail': None,
b487ef08
PH
3208 'description': officialTitle,
3209 'player_url': playerUrl
c8e30044 3210 }
46c8c432 3211
c8e30044
PH
3212 try:
3213 self._downloader.process_info(info)
3214 except UnavailableVideoError, err:
b487ef08 3215 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
a88bc6bb 3216 continue
c8e30044
PH
3217
3218
f9c68787
PH
3219class EscapistIE(InfoExtractor):
3220 """Information extractor for The Escapist """
3221
b845d58b 3222 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
f3098c4d 3223 IE_NAME = u'escapist'
f9c68787 3224
f9c68787
PH
3225 def report_extraction(self, showName):
3226 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3227
3228 def report_config_download(self, showName):
3229 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3230
f9c68787
PH
3231 def _real_extract(self, url):
3232 htmlParser = HTMLParser.HTMLParser()
3233
3234 mobj = re.match(self._VALID_URL, url)
3235 if mobj is None:
3236 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3237 return
3238 showName = mobj.group('showname')
3239 videoId = mobj.group('episode')
3240
3241 self.report_extraction(showName)
3242 try:
3243 webPage = urllib2.urlopen(url).read()
3244 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3245 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3246 return
3247
3248 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3249 description = htmlParser.unescape(descMatch.group(1))
3250 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3251 imgUrl = htmlParser.unescape(imgMatch.group(1))
3252 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3253 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3254 configUrlMatch = re.search('config=(.*)$', playerUrl)
3255 configUrl = urllib2.unquote(configUrlMatch.group(1))
3256
3257 self.report_config_download(showName)
3258 try:
3259 configJSON = urllib2.urlopen(configUrl).read()
3260 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3261 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3262 return
3263
3264 # Technically, it's JavaScript, not JSON
3265 configJSON = configJSON.replace("'", '"')
3266
3267 try:
3268 config = json.loads(configJSON)
3269 except (ValueError,), err:
3270 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3271 return
3272
3273 playlist = config['playlist']
3274 videoUrl = playlist[1]['url']
3275
3276 self._downloader.increment_downloads()
3277 info = {
3278 'id': videoId,
3279 'url': videoUrl,
3280 'uploader': showName,
3281 'upload_date': None,
3282 'title': showName,
e092418d 3283 'stitle': _simplify_title(showName),
f9c68787
PH
3284 'ext': 'flv',
3285 'format': 'flv',
3286 'thumbnail': imgUrl,
3287 'description': description,
3288 'player_url': playerUrl,
3289 }
3290
3291 try:
3292 self._downloader.process_info(info)
3293 except UnavailableVideoError, err:
3294 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3295
3296
8d89fbae
PH
3297class CollegeHumorIE(InfoExtractor):
3298 """Information extractor for collegehumor.com"""
3299
3300 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3301 IE_NAME = u'collegehumor'
3302
3303 def report_webpage(self, video_id):
3304 """Report information extraction."""
3305 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3306
3307 def report_extraction(self, video_id):
3308 """Report information extraction."""
3309 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3310
8d89fbae
PH
3311 def _real_extract(self, url):
3312 htmlParser = HTMLParser.HTMLParser()
3313
3314 mobj = re.match(self._VALID_URL, url)
3315 if mobj is None:
3316 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3317 return
3318 video_id = mobj.group('videoid')
3319
3320 self.report_webpage(video_id)
3321 request = urllib2.Request(url)
3322 try:
3323 webpage = urllib2.urlopen(request).read()
3324 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3325 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3326 return
3327
3328 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3329 if m is None:
3330 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3331 return
3332 internal_video_id = m.group('internalvideoid')
3333
3334 info = {
3335 'id': video_id,
3336 'internal_id': internal_video_id,
3337 }
3338
3339 self.report_extraction(video_id)
3340 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3341 try:
3342 metaXml = urllib2.urlopen(xmlUrl).read()
3343 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3344 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3345 return
3346
3347 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3348 try:
3349 videoNode = mdoc.findall('./video')[0]
3350 info['description'] = videoNode.findall('./description')[0].text
3351 info['title'] = videoNode.findall('./caption')[0].text
e092418d 3352 info['stitle'] = _simplify_title(info['title'])
8d89fbae
PH
3353 info['url'] = videoNode.findall('./file')[0].text
3354 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3355 info['ext'] = info['url'].rpartition('.')[2]
3356 info['format'] = info['ext']
3357 except IndexError:
3358 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3359 return
3360
3361 self._downloader.increment_downloads()
3362
3363 try:
3364 self._downloader.process_info(info)
3365 except UnavailableVideoError, err:
3366 self._downloader.trouble(u'\nERROR: unable to download video')
3367
f9c68787 3368
6501a06d
RB
3369class XVideosIE(InfoExtractor):
3370 """Information extractor for xvideos.com"""
3371
3372 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3373 IE_NAME = u'xvideos'
3374
3375 def report_webpage(self, video_id):
3376 """Report information extraction."""
3377 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3378
3379 def report_extraction(self, video_id):
3380 """Report information extraction."""
3381 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3382
6501a06d
RB
3383 def _real_extract(self, url):
3384 htmlParser = HTMLParser.HTMLParser()
3385
3386 mobj = re.match(self._VALID_URL, url)
3387 if mobj is None:
3388 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3389 return
3390 video_id = mobj.group(1).decode('utf-8')
3391
3392 self.report_webpage(video_id)
3393
a1a8713a 3394 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
6501a06d
RB
3395 try:
3396 webpage = urllib2.urlopen(request).read()
3397 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3398 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3399 return
3400
3401 self.report_extraction(video_id)
3402
3403
3404 # Extract video URL
3405 mobj = re.search(r'flv_url=(.+?)&', webpage)
3406 if mobj is None:
9f47175a 3407 self._downloader.trouble(u'ERROR: unable to extract video url')
6501a06d
RB
3408 return
3409 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3410
3411
3412 # Extract title
0f9b7722 3413 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
6501a06d
RB
3414 if mobj is None:
3415 self._downloader.trouble(u'ERROR: unable to extract video title')
3416 return
3417 video_title = mobj.group(1).decode('utf-8')
3418
3419
3420 # Extract video thumbnail
3421 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3422 if mobj is None:
3423 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3424 return
3425 video_thumbnail = mobj.group(1).decode('utf-8')
3426
3427
3428
3429 self._downloader.increment_downloads()
3430 info = {
3431 'id': video_id,
3432 'url': video_url,
3433 'uploader': None,
3434 'upload_date': None,
3435 'title': video_title,
e092418d 3436 'stitle': _simplify_title(video_title),
6501a06d
RB
3437 'ext': 'flv',
3438 'format': 'flv',
3439 'thumbnail': video_thumbnail,
3440 'description': None,
3441 'player_url': None,
3442 }
3443
3444 try:
3445 self._downloader.process_info(info)
3446 except UnavailableVideoError, err:
3447 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3448
3449
b20d4f86 3450class SoundcloudIE(InfoExtractor):
073d7a59 3451 """Information extractor for soundcloud.com
b20d4f86
KN
3452 To access the media, the uid of the song and a stream token
3453 must be extracted from the page source and the script must make
3454 a request to media.soundcloud.com/crossdomain.xml. Then
3455 the media can be grabbed by requesting from an url composed
3456 of the stream token and uid
3457 """
ecb3bfe5 3458
40306424 3459 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
ecb3bfe5
KN
3460 IE_NAME = u'soundcloud'
3461
b20d4f86
KN
3462 def __init__(self, downloader=None):
3463 InfoExtractor.__init__(self, downloader)
40306424
KN
3464
3465 def report_webpage(self, video_id):
3466 """Report information extraction."""
3467 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3468
3469 def report_extraction(self, video_id):
3470 """Report information extraction."""
3471 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3472
40306424
KN
3473 def _real_extract(self, url):
3474 htmlParser = HTMLParser.HTMLParser()
3475
3476 mobj = re.match(self._VALID_URL, url)
3477 if mobj is None:
3478 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3479 return
3480
b20d4f86
KN
3481 # extract uploader (which is in the url)
3482 uploader = mobj.group(1).decode('utf-8')
3483 # extract simple title (uploader + slug of song title)
3484 slug_title = mobj.group(2).decode('utf-8')
40306424
KN
3485 simple_title = uploader + '-' + slug_title
3486
3487 self.report_webpage('%s/%s' % (uploader, slug_title))
3488
3489 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3490 try:
3491 webpage = urllib2.urlopen(request).read()
3492 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3493 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3494 return
3495
3496 self.report_extraction('%s/%s' % (uploader, slug_title))
3497
ec574c2c 3498 # extract uid and stream token that soundcloud hands out for access
5b3330e0 3499 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
b20d4f86 3500 if mobj:
871be928
KN
3501 video_id = mobj.group(1)
3502 stream_token = mobj.group(2)
b20d4f86 3503
ec574c2c
KN
3504 # extract unsimplified title
3505 mobj = re.search('"title":"(.*?)",', webpage)
3506 if mobj:
3507 title = mobj.group(1)
3508
3509 # construct media url (with uid/token)
b20d4f86
KN
3510 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3511 mediaURL = mediaURL % (video_id, stream_token)
3512
3513 # description
3514 description = u'No description available'
871be928 3515 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
b20d4f86
KN
3516 if mobj:
3517 description = mobj.group(1)
3518
3519 # upload date
871be928
KN
3520 upload_date = None
3521 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
b20d4f86
KN
3522 if mobj:
3523 try:
871be928 3524 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
0ae7abe5 3525 except Exception, e:
871be928 3526 print str(e)
b20d4f86 3527
ec574c2c 3528 # for soundcloud, a request to a cross domain is required for cookies
b20d4f86
KN
3529 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3530
3531 try:
3532 self._downloader.process_info({
ec574c2c 3533 'id': video_id.decode('utf-8'),
871be928 3534 'url': mediaURL,
ec574c2c 3535 'uploader': uploader.decode('utf-8'),
073d7a59 3536 'upload_date': upload_date,
ec574c2c
KN
3537 'title': simple_title.decode('utf-8'),
3538 'stitle': simple_title.decode('utf-8'),
40306424
KN
3539 'ext': u'mp3',
3540 'format': u'NA',
3541 'player_url': None,
ec574c2c 3542 'description': description.decode('utf-8')
b20d4f86
KN
3543 })
3544 except UnavailableVideoError:
3545 self._downloader.trouble(u'\nERROR: unable to download video')
ecb3bfe5 3546
208c4b91 3547
3b98a5dd
OA
3548class InfoQIE(InfoExtractor):
3549 """Information extractor for infoq.com"""
3550
3551 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3552 IE_NAME = u'infoq'
3553
3554 def report_webpage(self, video_id):
3555 """Report information extraction."""
3556 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3557
3558 def report_extraction(self, video_id):
3559 """Report information extraction."""
3560 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3561
3b98a5dd
OA
3562 def _real_extract(self, url):
3563 htmlParser = HTMLParser.HTMLParser()
3564
3565 mobj = re.match(self._VALID_URL, url)
3566 if mobj is None:
3567 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3568 return
3569
3570 self.report_webpage(url)
3571
3572 request = urllib2.Request(url)
3573 try:
3574 webpage = urllib2.urlopen(request).read()
3575 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3576 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3577 return
3578
3579 self.report_extraction(url)
3580
3581
3582 # Extract video URL
3583 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3584 if mobj is None:
3585 self._downloader.trouble(u'ERROR: unable to extract video url')
3586 return
3587 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3588
3589
3590 # Extract title
3591 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3592 if mobj is None:
3593 self._downloader.trouble(u'ERROR: unable to extract video title')
3594 return
3595 video_title = mobj.group(1).decode('utf-8')
3596
3b98a5dd
OA
3597 # Extract description
3598 video_description = u'No description available.'
3599 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3600 if mobj is not None:
3601 video_description = mobj.group(1).decode('utf-8')
3602
3603 video_filename = video_url.split('/')[-1]
3604 video_id, extension = video_filename.split('.')
3605
3606 self._downloader.increment_downloads()
3607 info = {
3608 'id': video_id,
3609 'url': video_url,
3610 'uploader': None,
3611 'upload_date': None,
3612 'title': video_title,
e092418d 3613 'stitle': _simplify_title(video_title),
3b98a5dd
OA
3614 'ext': extension,
3615 'format': extension, # Extension is always(?) mp4, but seems to be flv
3616 'thumbnail': None,
3617 'description': video_description,
3618 'player_url': None,
3619 }
3620
3621 try:
3622 self._downloader.process_info(info)
3623 except UnavailableVideoError, err:
3624 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3625
b158a1d9 3626class MixcloudIE(InfoExtractor):
3627 """Information extractor for www.mixcloud.com"""
3628 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3629 IE_NAME = u'mixcloud'
3b98a5dd 3630
b158a1d9 3631 def __init__(self, downloader=None):
3632 InfoExtractor.__init__(self, downloader)
3633
3634 def report_download_json(self, file_id):
3635 """Report JSON download."""
3636 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3637
3638 def report_extraction(self, file_id):
3639 """Report information extraction."""
3640 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3641
3642 def get_urls(self, jsonData, fmt, bitrate='best'):
3643 """Get urls from 'audio_formats' section in json"""
3644 file_url = None
3645 try:
3646 bitrate_list = jsonData[fmt]
3647 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3648 bitrate = max(bitrate_list) # select highest
3649
3650 url_list = jsonData[fmt][bitrate]
3651 except TypeError: # we have no bitrate info.
3652 url_list = jsonData[fmt]
3653
3654 return url_list
849edab8 3655
b158a1d9 3656 def check_urls(self, url_list):
3657 """Returns 1st active url from list"""
3658 for url in url_list:
3659 try:
3660 urllib2.urlopen(url)
3661 return url
3662 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3663 url = None
3664
3665 return None
3666
3667 def _print_formats(self, formats):
3668 print 'Available formats:'
3669 for fmt in formats.keys():
3670 for b in formats[fmt]:
3671 try:
3672 ext = formats[fmt][b][0]
3673 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3674 except TypeError: # we have no bitrate info
3675 ext = formats[fmt][0]
3676 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3677 break
3678
3679 def _real_extract(self, url):
3680 mobj = re.match(self._VALID_URL, url)
3681 if mobj is None:
3682 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3683 return
3684 # extract uploader & filename from url
3685 uploader = mobj.group(1).decode('utf-8')
3686 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3687
3688 # construct API request
3689 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3690 # retrieve .json file with links to files
3691 request = urllib2.Request(file_url)
3692 try:
3693 self.report_download_json(file_url)
3694 jsonData = urllib2.urlopen(request).read()
3695 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3696 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3697 return
3698
3699 # parse JSON
3700 json_data = json.loads(jsonData)
3701 player_url = json_data['player_swf_url']
3702 formats = dict(json_data['audio_formats'])
3703
3704 req_format = self._downloader.params.get('format', None)
3705 bitrate = None
3706
3707 if self._downloader.params.get('listformats', None):
3708 self._print_formats(formats)
3709 return
3710
3711 if req_format is None or req_format == 'best':
3712 for format_param in formats.keys():
3713 url_list = self.get_urls(formats, format_param)
3714 # check urls
3715 file_url = self.check_urls(url_list)
3716 if file_url is not None:
3717 break # got it!
3718 else:
3719 if req_format not in formats.keys():
3720 self._downloader.trouble(u'ERROR: format is not available')
3721 return
3722
3723 url_list = self.get_urls(formats, req_format)
3724 file_url = self.check_urls(url_list)
3725 format_param = req_format
3726
3727 # We have audio
3728 self._downloader.increment_downloads()
3729 try:
3730 # Process file information
3731 self._downloader.process_info({
3732 'id': file_id.decode('utf-8'),
3733 'url': file_url.decode('utf-8'),
3734 'uploader': uploader.decode('utf-8'),
3735 'upload_date': u'NA',
3736 'title': json_data['name'],
3737 'stitle': _simplify_title(json_data['name']),
3738 'ext': file_url.split('.')[-1].decode('utf-8'),
3739 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3740 'thumbnail': json_data['thumbnail_url'],
3741 'description': json_data['description'],
3742 'player_url': player_url.decode('utf-8'),
3743 })
3744 except UnavailableVideoError, err:
3745 self._downloader.trouble(u'ERROR: unable to download file')
00f95a93 3746
849edab8
PH
3747
3748
65cd34c5
RG
3749class PostProcessor(object):
3750 """Post Processor class.
3751
3752 PostProcessor objects can be added to downloaders with their
3753 add_post_processor() method. When the downloader has finished a
3754 successful download, it will take its internal chain of PostProcessors
3755 and start calling the run() method on each one of them, first with
3756 an initial argument and then with the returned value of the previous
3757 PostProcessor.
3758
3759 The chain will be stopped if one of them ever returns None or the end
3760 of the chain is reached.
3761
3762 PostProcessor objects follow a "mutual registration" process similar
3763 to InfoExtractor objects.
3764 """
3765
3766 _downloader = None
3767
3768 def __init__(self, downloader=None):
3769 self._downloader = downloader
3770
65cd34c5
RG
3771 def set_downloader(self, downloader):
3772 """Sets the downloader for this PP."""
3773 self._downloader = downloader
d3975459 3774
65cd34c5
RG
3775 def run(self, information):
3776 """Run the PostProcessor.
3777
3778 The "information" argument is a dictionary like the ones
2f11508a 3779 composed by InfoExtractors. The only difference is that this
65cd34c5
RG
3780 one has an extra field called "filepath" that points to the
3781 downloaded file.
3782
3783 When this method returns None, the postprocessing chain is
3784 stopped. However, this method may return an information
3785 dictionary that will be passed to the next postprocessing
3786 object in the chain. It can be the one it received after
3787 changing some fields.
3788
3789 In addition, this method may raise a PostProcessingError
3790 exception that will be taken into account by the downloader
3791 it was called from.
3792 """
3793 return information # by default, do nothing
d3975459 3794
c0a10ca8 3795
3072fab1
RG
3796class FFmpegExtractAudioPP(PostProcessor):
3797
c99dcbd2 3798 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
3072fab1
RG
3799 PostProcessor.__init__(self, downloader)
3800 if preferredcodec is None:
3801 preferredcodec = 'best'
3802 self._preferredcodec = preferredcodec
18b7f874 3803 self._preferredquality = preferredquality
3804 self._keepvideo = keepvideo
3072fab1
RG
3805
3806 @staticmethod
3807 def get_audio_codec(path):
da273188 3808 try:
2727dbf7
RG
3809 cmd = ['ffprobe', '-show_streams', '--', path]
3810 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
da273188
RG
3811 output = handle.communicate()[0]
3812 if handle.wait() != 0:
3813 return None
3814 except (IOError, OSError):
3072fab1
RG
3815 return None
3816 audio_codec = None
3817 for line in output.split('\n'):
3818 if line.startswith('codec_name='):
3819 audio_codec = line.split('=')[1].strip()
3820 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
3821 return audio_codec
3822 return None
3823
3824 @staticmethod
3825 def run_ffmpeg(path, out_path, codec, more_opts):
3826 try:
2727dbf7
RG
3827 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3828 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3072fab1
RG
3829 return (ret == 0)
3830 except (IOError, OSError):
3831 return False
3832
3833 def run(self, information):
3834 path = information['filepath']
3835
3836 filecodec = self.get_audio_codec(path)
3837 if filecodec is None:
da273188 3838 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
3072fab1
RG
3839 return None
3840
3841 more_opts = []
3842 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
efb113c7 3843 if filecodec in ['aac', 'mp3', 'vorbis']:
3072fab1
RG
3844 # Lossless if possible
3845 acodec = 'copy'
3846 extension = filecodec
3847 if filecodec == 'aac':
3848 more_opts = ['-f', 'adts']
58384838
RC
3849 if filecodec == 'vorbis':
3850 extension = 'ogg'
3072fab1
RG
3851 else:
3852 # MP3 otherwise.
3853 acodec = 'libmp3lame'
3854 extension = 'mp3'
c99dcbd2
PH
3855 more_opts = []
3856 if self._preferredquality is not None:
3857 more_opts += ['-ab', self._preferredquality]
3072fab1
RG
3858 else:
3859 # We convert the audio (lossy)
58384838 3860 acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
3072fab1 3861 extension = self._preferredcodec
c99dcbd2
PH
3862 more_opts = []
3863 if self._preferredquality is not None:
3864 more_opts += ['-ab', self._preferredquality]
3072fab1
RG
3865 if self._preferredcodec == 'aac':
3866 more_opts += ['-f', 'adts']
58384838
RC
3867 if self._preferredcodec == 'vorbis':
3868 extension = 'ogg'
3072fab1
RG
3869
3870 (prefix, ext) = os.path.splitext(path)
3871 new_path = prefix + '.' + extension
3872 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3873 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3874
3875 if not status:
1bd92582 3876 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
3072fab1
RG
3877 return None
3878
36597dc4
K
3879 # Try to update the date time for extracted audio file.
3880 if information.get('filetime') is not None:
3881 try:
3882 os.utime(new_path, (time.time(), information['filetime']))
3883 except:
3884 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
3885
18b7f874 3886 if not self._keepvideo:
3887 try:
3888 os.remove(path)
3889 except (IOError, OSError):
3890 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
3891 return None
3072fab1
RG
3892
3893 information['filepath'] = new_path
3894 return information
3895
5fb3df4a
GV
3896
3897def updateSelf(downloader, filename):
3898 ''' Update the program file with the latest version from the repository '''
3899 # Note: downloader only used for options
3900 if not os.access(filename, os.W_OK):
3901 sys.exit('ERROR: no write permissions on %s' % filename)
3902
d207e7cf 3903 downloader.to_screen('Updating to latest version...')
5fb3df4a 3904
4fa74b52 3905 try:
d207e7cf
PH
3906 try:
3907 urlh = urllib.urlopen(UPDATE_URL)
3908 newcontent = urlh.read()
27365956
PH
3909
3910 vmatch = re.search("__version__ = '([^']+)'", newcontent)
3911 if vmatch is not None and vmatch.group(1) == __version__:
3912 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
3913 return
d207e7cf
PH
3914 finally:
3915 urlh.close()
5fb3df4a
GV
3916 except (IOError, OSError), err:
3917 sys.exit('ERROR: unable to download latest version')
f9f1e798 3918
5fb3df4a 3919 try:
d207e7cf
PH
3920 outf = open(filename, 'wb')
3921 try:
3922 outf.write(newcontent)
3923 finally:
3924 outf.close()
5fb3df4a
GV
3925 except (IOError, OSError), err:
3926 sys.exit('ERROR: unable to overwrite current version')
4bec29ef 3927
eb6c37da 3928 downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
80066952 3929
4f9f96f6
GV
3930def parseOpts():
3931 # Deferred imports
3932 import getpass
3933 import optparse
c379c181
PH
3934 import shlex
3935
3936 def _readOptions(filename):
3937 try:
3938 optionf = open(filename)
3939 except IOError:
3940 return [] # silently skip if file is not present
3941 try:
3942 res = []
3943 for l in optionf:
3944 res += shlex.split(l, comments=True)
3945 finally:
3946 optionf.close()
3947 return res
e7cf18cb 3948
4f9f96f6
GV
3949 def _format_option_string(option):
3950 ''' ('-o', '--option') -> -o, --format METAVAR'''
80066952 3951
4f9f96f6
GV
3952 opts = []
3953
3954 if option._short_opts: opts.append(option._short_opts[0])
3955 if option._long_opts: opts.append(option._long_opts[0])
3956 if len(opts) > 1: opts.insert(1, ', ')
3957
3958 if option.takes_value(): opts.append(' %s' % option.metavar)
3959
3960 return "".join(opts)
3961
6a4f0a11
GV
3962 def _find_term_columns():
3963 columns = os.environ.get('COLUMNS', None)
2c8d32de
PH
3964 if columns:
3965 return int(columns)
3966
4f2a5e06
PH
3967 try:
3968 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3969 out,err = sp.communicate()
eb0387a8 3970 return int(out.split()[1])
4f2a5e06
PH
3971 except:
3972 pass
2c8d32de 3973 return None
6a4f0a11 3974
51c8e53f
GV
3975 max_width = 80
3976 max_help_position = 80
3977
3978 # No need to wrap help messages if we're on a wide console
6a4f0a11 3979 columns = _find_term_columns()
51c8e53f
GV
3980 if columns: max_width = columns
3981
3982 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4f9f96f6
GV
3983 fmt.format_option_strings = _format_option_string
3984
3985 kw = {
3986 'version' : __version__,
3987 'formatter' : fmt,
a2f7e3a5 3988 'usage' : '%prog [options] url [url...]',
4f9f96f6
GV
3989 'conflict_handler' : 'resolve',
3990 }
3991
3992 parser = optparse.OptionParser(**kw)
3993
3994 # option groups
3995 general = optparse.OptionGroup(parser, 'General Options')
20e91e83 3996 selection = optparse.OptionGroup(parser, 'Video Selection')
4f9f96f6
GV
3997 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3998 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3999 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4000 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4001 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4002
4003 general.add_option('-h', '--help',
4004 action='help', help='print this help text and exit')
4005 general.add_option('-v', '--version',
4006 action='version', help='print program version and exit')
4007 general.add_option('-U', '--update',
e0e56865 4008 action='store_true', dest='update_self', help='update this program to latest version')
4f9f96f6
GV
4009 general.add_option('-i', '--ignore-errors',
4010 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4011 general.add_option('-r', '--rate-limit',
4012 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4013 general.add_option('-R', '--retries',
4014 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4f9f96f6
GV
4015 general.add_option('--dump-user-agent',
4016 action='store_true', dest='dump_user_agent',
4017 help='display the current browser identification', default=False)
f3098c4d
PH
4018 general.add_option('--list-extractors',
4019 action='store_true', dest='list_extractors',
4020 help='List all supported extractors and the URLs they would handle', default=False)
4f9f96f6 4021
20e91e83
ABP
4022 selection.add_option('--playlist-start',
4023 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4024 selection.add_option('--playlist-end',
4025 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4026 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4027 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
b88a5250 4028 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
20e91e83 4029
4f9f96f6
GV
4030 authentication.add_option('-u', '--username',
4031 dest='username', metavar='USERNAME', help='account username')
4032 authentication.add_option('-p', '--password',
4033 dest='password', metavar='PASSWORD', help='account password')
4034 authentication.add_option('-n', '--netrc',
4035 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4036
4037
4038 video_format.add_option('-f', '--format',
4039 action='store', dest='format', metavar='FORMAT', help='video format code')
4040 video_format.add_option('--all-formats',
5260e68f 4041 action='store_const', dest='format', help='download all available video formats', const='all')
4f9f96f6
GV
4042 video_format.add_option('--max-quality',
4043 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2761012f
PH
4044 video_format.add_option('-F', '--list-formats',
4045 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4f9f96f6
GV
4046
4047
4048 verbosity.add_option('-q', '--quiet',
4049 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4050 verbosity.add_option('-s', '--simulate',
9b4556c4
PH
4051 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4052 verbosity.add_option('--skip-download',
4053 action='store_true', dest='skip_download', help='do not download the video', default=False)
4f9f96f6
GV
4054 verbosity.add_option('-g', '--get-url',
4055 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4056 verbosity.add_option('-e', '--get-title',
4057 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4058 verbosity.add_option('--get-thumbnail',
4059 action='store_true', dest='getthumbnail',
4060 help='simulate, quiet but print thumbnail URL', default=False)
4061 verbosity.add_option('--get-description',
4062 action='store_true', dest='getdescription',
4063 help='simulate, quiet but print video description', default=False)
4064 verbosity.add_option('--get-filename',
4065 action='store_true', dest='getfilename',
4066 help='simulate, quiet but print output filename', default=False)
da0db53a
DH
4067 verbosity.add_option('--get-format',
4068 action='store_true', dest='getformat',
4069 help='simulate, quiet but print output format', default=False)
4f9f96f6
GV
4070 verbosity.add_option('--no-progress',
4071 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4072 verbosity.add_option('--console-title',
4073 action='store_true', dest='consoletitle',
4074 help='display progress in console titlebar', default=False)
4075
4076
4077 filesystem.add_option('-t', '--title',
4078 action='store_true', dest='usetitle', help='use title in file name', default=False)
4079 filesystem.add_option('-l', '--literal',
4080 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4081 filesystem.add_option('-A', '--auto-number',
4082 action='store_true', dest='autonumber',
4083 help='number downloaded files starting from 00000', default=False)
4084 filesystem.add_option('-o', '--output',
31a2ec2d 4085 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent')
4f9f96f6
GV
4086 filesystem.add_option('-a', '--batch-file',
4087 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4088 filesystem.add_option('-w', '--no-overwrites',
4089 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4090 filesystem.add_option('-c', '--continue',
c25303c3 4091 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
18bb3d1e
PH
4092 filesystem.add_option('--no-continue',
4093 action='store_false', dest='continue_dl',
4094 help='do not resume partially downloaded files (restart from beginning)')
4f9f96f6 4095 filesystem.add_option('--cookies',
abb870d1 4096 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4f9f96f6
GV
4097 filesystem.add_option('--no-part',
4098 action='store_true', dest='nopart', help='do not use .part files', default=False)
4099 filesystem.add_option('--no-mtime',
4100 action='store_false', dest='updatetime',
4101 help='do not use the Last-modified header to set the file modification time', default=True)
2c8d32de
PH
4102 filesystem.add_option('--write-description',
4103 action='store_true', dest='writedescription',
4104 help='write video description to a .description file', default=False)
4105 filesystem.add_option('--write-info-json',
4106 action='store_true', dest='writeinfojson',
4107 help='write video metadata to a .info.json file', default=False)
4f9f96f6
GV
4108
4109
4110 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4111 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4112 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
58384838 4113 help='"best", "aac", "vorbis" or "mp3"; best by default')
c99dcbd2
PH
4114 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4115 help='ffmpeg audio bitrate specification, 128k by default')
4116 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4117 help='keeps the video file on disk after the post-processing; the video is erased by default')
4f9f96f6
GV
4118
4119
4120 parser.add_option_group(general)
20e91e83 4121 parser.add_option_group(selection)
4f9f96f6
GV
4122 parser.add_option_group(filesystem)
4123 parser.add_option_group(verbosity)
4124 parser.add_option_group(video_format)
4125 parser.add_option_group(authentication)
4126 parser.add_option_group(postproc)
4127
0cd235ee
PH
4128 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4129 if xdg_config_home:
4130 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4131 else:
4132 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4133 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
c379c181 4134 opts, args = parser.parse_args(argv)
4f9f96f6
GV
4135
4136 return parser, opts, args
4137
f3098c4d
PH
4138def gen_extractors():
4139 """ Return a list of an instance of every supported extractor.
4140 The order does matter; the first extractor matched is the one handling the URL.
4141 """
4142 youtube_ie = YoutubeIE()
4143 google_ie = GoogleIE()
4144 yahoo_ie = YahooIE()
4145 return [
f3098c4d
PH
4146 YoutubePlaylistIE(youtube_ie),
4147 YoutubeUserIE(youtube_ie),
4148 YoutubeSearchIE(youtube_ie),
1cde6f1d
PH
4149 youtube_ie,
4150 MetacafeIE(youtube_ie),
4151 DailymotionIE(),
f3098c4d
PH
4152 google_ie,
4153 GoogleSearchIE(google_ie),
4154 PhotobucketIE(),
4155 yahoo_ie,
4156 YahooSearchIE(yahoo_ie),
4157 DepositFilesIE(),
4158 FacebookIE(),
4159 BlipTVIE(),
4160 VimeoIE(),
4161 MyVideoIE(),
4162 ComedyCentralIE(),
4163 EscapistIE(),
8d89fbae 4164 CollegeHumorIE(),
6501a06d 4165 XVideosIE(),
38348005 4166 SoundcloudIE(),
3b98a5dd 4167 InfoQIE(),
b158a1d9 4168 MixcloudIE(),
f3098c4d
PH
4169
4170 GenericIE()
4171 ]
4172
235b3ba4 4173def _real_main():
5adcaa43 4174 parser, opts, args = parseOpts()
4f9f96f6 4175
5adcaa43
GV
4176 # Open appropriate CookieJar
4177 if opts.cookiefile is None:
4178 jar = cookielib.CookieJar()
4179 else:
8cc44341 4180 try:
5adcaa43
GV
4181 jar = cookielib.MozillaCookieJar(opts.cookiefile)
4182 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4183 jar.load()
4184 except (IOError, OSError), err:
4185 sys.exit(u'ERROR: unable to open cookie file')
80066952 4186
5adcaa43
GV
4187 # Dump user agent
4188 if opts.dump_user_agent:
4189 print std_headers['User-Agent']
4190 sys.exit(0)
e7cf18cb 4191
5adcaa43
GV
4192 # Batch file verification
4193 batchurls = []
4194 if opts.batchfile is not None:
8cc44341 4195 try:
5adcaa43
GV
4196 if opts.batchfile == '-':
4197 batchfd = sys.stdin
4bec29ef 4198 else:
5adcaa43
GV
4199 batchfd = open(opts.batchfile, 'r')
4200 batchurls = batchfd.readlines()
4201 batchurls = [x.strip() for x in batchurls]
4202 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4203 except IOError:
4204 sys.exit(u'ERROR: batch file could not be read')
4205 all_urls = batchurls + args
4206
f3098c4d
PH
4207 # General configuration
4208 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4209 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4210 urllib2.install_opener(opener)
4211 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4212
4213 extractors = gen_extractors()
4214
4215 if opts.list_extractors:
4216 for ie in extractors:
4217 print(ie.IE_NAME)
4218 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4219 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4220 for mu in matchedUrls:
4221 print(u' ' + mu)
4222 sys.exit(0)
4223
5adcaa43
GV
4224 # Conflicting, missing and erroneous options
4225 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4226 parser.error(u'using .netrc conflicts with giving username/password')
4227 if opts.password is not None and opts.username is None:
4228 parser.error(u'account username missing')
4229 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4230 parser.error(u'using output template conflicts with using title, literal title or auto number')
4231 if opts.usetitle and opts.useliteral:
4232 parser.error(u'using title conflicts with using literal title')
4233 if opts.username is not None and opts.password is None:
4234 opts.password = getpass.getpass(u'Type account password and press return:')
4235 if opts.ratelimit is not None:
4236 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4237 if numeric_limit is None:
4238 parser.error(u'invalid rate limit specified')
4239 opts.ratelimit = numeric_limit
4240 if opts.retries is not None:
8cc44341 4241 try:
5adcaa43 4242 opts.retries = long(opts.retries)
8cc44341 4243 except (TypeError, ValueError), err:
5adcaa43
GV
4244 parser.error(u'invalid retry count specified')
4245 try:
2c8d32de 4246 opts.playliststart = int(opts.playliststart)
5adcaa43 4247 if opts.playliststart <= 0:
2c8d32de 4248 raise ValueError(u'Playlist start must be positive')
5adcaa43
GV
4249 except (TypeError, ValueError), err:
4250 parser.error(u'invalid playlist start number specified')
4251 try:
2c8d32de 4252 opts.playlistend = int(opts.playlistend)
5adcaa43 4253 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2c8d32de 4254 raise ValueError(u'Playlist end must be greater than playlist start')
5adcaa43
GV
4255 except (TypeError, ValueError), err:
4256 parser.error(u'invalid playlist end number specified')
4257 if opts.extractaudio:
58384838 4258 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
5adcaa43
GV
4259 parser.error(u'invalid audio format specified')
4260
5adcaa43
GV
4261 # File downloader
4262 fd = FileDownloader({
4263 'usenetrc': opts.usenetrc,
4264 'username': opts.username,
4265 'password': opts.password,
da0db53a 4266 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
5adcaa43
GV
4267 'forceurl': opts.geturl,
4268 'forcetitle': opts.gettitle,
4269 'forcethumbnail': opts.getthumbnail,
4270 'forcedescription': opts.getdescription,
4271 'forcefilename': opts.getfilename,
da0db53a 4272 'forceformat': opts.getformat,
9b4556c4 4273 'simulate': opts.simulate,
da0db53a 4274 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
5adcaa43
GV
4275 'format': opts.format,
4276 'format_limit': opts.format_limit,
3de2a1e6 4277 'listformats': opts.listformats,
5adcaa43
GV
4278 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4279 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4280 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4281 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4282 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4283 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4284 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4285 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4286 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4287 or u'%(id)s.%(ext)s'),
4288 'ignoreerrors': opts.ignoreerrors,
4289 'ratelimit': opts.ratelimit,
4290 'nooverwrites': opts.nooverwrites,
4291 'retries': opts.retries,
4292 'continuedl': opts.continue_dl,
4293 'noprogress': opts.noprogress,
4294 'playliststart': opts.playliststart,
4295 'playlistend': opts.playlistend,
4296 'logtostderr': opts.outtmpl == '-',
4297 'consoletitle': opts.consoletitle,
4298 'nopart': opts.nopart,
4299 'updatetime': opts.updatetime,
2c8d32de
PH
4300 'writedescription': opts.writedescription,
4301 'writeinfojson': opts.writeinfojson,
20e91e83
ABP
4302 'matchtitle': opts.matchtitle,
4303 'rejecttitle': opts.rejecttitle,
c379c181 4304 'max_downloads': opts.max_downloads,
5adcaa43 4305 })
8c5dc3ad
PH
4306 for extractor in extractors:
4307 fd.add_info_extractor(extractor)
5adcaa43
GV
4308
4309 # PostProcessors
4310 if opts.extractaudio:
c99dcbd2 4311 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
5adcaa43
GV
4312
4313 # Update version
4314 if opts.update_self:
4315 updateSelf(fd, sys.argv[0])
4316
4317 # Maybe do nothing
4318 if len(all_urls) < 1:
4319 if not opts.update_self:
4320 parser.error(u'you must provide at least one URL')
4321 else:
4322 sys.exit()
4323 retcode = fd.download(all_urls)
80066952 4324
5adcaa43
GV
4325 # Dump cookie jar if requested
4326 if opts.cookiefile is not None:
4327 try:
4328 jar.save()
4329 except (IOError, OSError), err:
4330 sys.exit(u'ERROR: unable to save cookie jar')
80066952 4331
5adcaa43 4332 sys.exit(retcode)
80066952 4333
235b3ba4 4334def main():
5adcaa43 4335 try:
235b3ba4 4336 _real_main()
e5bf0f55
RG
4337 except DownloadError:
4338 sys.exit(1)
4339 except SameFileError:
76a7f364 4340 sys.exit(u'ERROR: fixed output name but more than one file to download')
4fa74b52 4341 except KeyboardInterrupt:
76a7f364 4342 sys.exit(u'\nERROR: Interrupted by user')
e9cb9c28 4343
235b3ba4
PH
4344if __name__ == '__main__':
4345 main()
4346
e9cb9c28 4347# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: