#!/usr/bin/env python
# -*- coding: utf-8 -*-

__author__ = (
    'Ricardo Garcia Gonzalez',
    'Danny Colligan',
    'Benjamin Johnson',
    'Vasyl\' Vavrychuk',
    'Witold Baryluk',
    'Paweł Paprota',
    'Gergely Imreh',
    'Rogério Brito',
    'Philipp Hagemeister',
    'Sören Schulze',
    )

__license__ = 'Public Domain'
__version__ = '2011.09.13'

UPDATE_URL = 'https://raw.github.com/phihag/youtube-dl/master/youtube-dl'

import cookielib
import datetime
import gzip
import htmlentitydefs
import httplib
import locale
import math
import netrc
import os
import os.path
import re
import socket
import string
import subprocess
import sys
import time
import urllib
import urllib2
import warnings
import zlib

if os.name == 'nt':
    import ctypes

try:
    import email.utils
except ImportError: # Python 2.4
    import email.Utils
try:
    import cStringIO as StringIO
except ImportError:
    import StringIO

# parse_qs was moved from the cgi module to the urlparse module recently.
try:
    from urlparse import parse_qs
except ImportError:
    from cgi import parse_qs

try:
    import lxml.etree
except ImportError:
    pass # Handled below

try:
    import xml.etree.ElementTree
except ImportError: # Python<2.5
    pass # Not officially supported, but let it slip

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}

simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')

try:
    import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    import re
    class json(object):
        @staticmethod
        def loads(s):
            s = s.decode('UTF-8')
            def raiseError(msg, i):
                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
            def skipSpace(i, expectMore=True):
                while i < len(s) and s[i] in ' \t\r\n':
                    i += 1
                if expectMore:
                    if i >= len(s):
                        raiseError('Premature end', i)
                return i
            def decodeEscape(match):
                esc = match.group(1)
                _STATIC = {
                    '"': '"',
                    '\\': '\\',
                    '/': '/',
                    'b': unichr(0x8),
                    'f': unichr(0xc),
                    'n': '\n',
                    'r': '\r',
                    't': '\t',
                }
                if esc in _STATIC:
                    return _STATIC[esc]
                if esc[0] == 'u':
                    if len(esc) == 1+4:
                        return unichr(int(esc[1:5], 16))
                    if len(esc) == 5+6 and esc[5:7] == '\\u':
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                raise ValueError('Unknown escape ' + str(esc))
            def parseString(i):
                i += 1
                e = i
                while True:
                    e = s.index('"', e)
                    bslashes = 0
                    while s[e-bslashes-1] == '\\':
                        bslashes += 1
                    if bslashes % 2 == 1:
                        e += 1
                        continue
                    break
                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                stri = rexp.sub(decodeEscape, s[i:e])
                return (e+1, stri)
            def parseObj(i):
                i += 1
                res = {}
                i = skipSpace(i)
                if s[i] == '}': # Empty dictionary
                    return (i+1, res)
                while True:
                    if s[i] != '"':
                        raiseError('Expected a string object key', i)
                    i, key = parseString(i)
                    i = skipSpace(i)
                    if i >= len(s) or s[i] != ':':
                        raiseError('Expected a colon', i)
                    i, val = parse(i+1)
                    res[key] = val
                    i = skipSpace(i)
                    if s[i] == '}':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected comma or closing curly brace', i)
                    i = skipSpace(i+1)
            def parseArray(i):
                res = []
                i = skipSpace(i+1)
                if s[i] == ']': # Empty array
                    return (i+1, res)
                while True:
                    i, val = parse(i)
                    res.append(val)
                    i = skipSpace(i) # Raise exception if premature end
                    if s[i] == ']':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected a comma or closing bracket', i)
                    i = skipSpace(i+1)
            def parseDiscrete(i):
                for k, v in {'true': True, 'false': False, 'null': None}.items():
                    if s.startswith(k, i):
                        return (i+len(k), v)
                raiseError('Not a boolean (or null)', i)
            def parseNumber(i):
                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                if mobj is None:
                    raiseError('Not a number', i)
                nums = mobj.group(1)
                if '.' in nums or 'e' in nums or 'E' in nums:
                    return (i+len(nums), float(nums))
                return (i+len(nums), int(nums))
            CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
            def parse(i):
                i = skipSpace(i)
                i, res = CHARMAP.get(s[i], parseNumber)(i)
                i = skipSpace(i, False)
                return (i, res)
            i, res = parse(0)
            if i < len(s):
                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
            return res

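# Illustrative sketch (not part of the original script): whichever branch wins
# above, the rest of the code only relies on json.loads() returning plain
# Python objects, e.g.:
#
#     json.loads('{"title": "video", "formats": [18, 22], "live": false}')
#     # -> {u'title': u'video', u'formats': [18, 22], u'live': False}
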
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    def yield_preferredencoding():
        try:
            pref = locale.getpreferredencoding()
            u'TEST'.encode(pref)
        except:
            pref = 'UTF-8'
        while True:
            yield pref
    return yield_preferredencoding().next()


def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Unicode character
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)


def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename."""
    utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
    return utitle.replace(unicode(os.sep), u'%')


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(filename, open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected


class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

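    # Illustrative sketch (not part of the original script) of how such a
    # handler is meant to be installed; the cookie jar and URL below are
    # hypothetical.
    #
    #     jar = cookielib.CookieJar()
    #     opener = urllib2.build_opener(YoutubeDLHandler(), urllib2.HTTPCookieProcessor(jar))
    #     urllib2.install_opener(opener)
    #     # Opt a single request out of gzip/deflate handling:
    #     req = urllib2.Request(url, None, {'Youtubedl-no-compression': 'True'})
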
    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp


class FileDownloader(object):
    """File Downloader class.

    File downloader objects are responsible for downloading the actual
    video file and writing it to disk if the user has requested it, among
    some other tasks. In most cases there should be one per program. Given
    a video URL, the downloader does not know how to extract all the needed
    information (that is the task of the InfoExtractors), so it has to pass
    the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    A file downloader accepts a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:         Username for authentication purposes.
    password:         Password for authentication purposes.
    usenetrc:         Use netrc for authentication instead.
    quiet:            Do not print messages to stdout.
    forceurl:         Force printing final URL.
    forcetitle:       Force printing title.
    forcethumbnail:   Force printing thumbnail URL.
    forcedescription: Force printing description.
    forcefilename:    Force printing final filename.
    simulate:         Do not download the video files.
    format:           Video format code.
    format_limit:     Highest quality format to try.
    outtmpl:          Template for output names.
    ignoreerrors:     Do not stop on download errors.
    ratelimit:        Download speed limit, in bytes/sec.
    nooverwrites:     Prevent overwriting files.
    retries:          Number of times to retry for HTTP error 5xx.
    continuedl:       Try to continue downloads if possible.
    noprogress:       Do not print the progress bar.
    playliststart:    Playlist item to start at.
    playlistend:      Playlist item to end at.
    logtostderr:      Log messages to stderr instead of stdout.
    consoletitle:     Display progress in console window's titlebar.
    nopart:           Do not use temporary .part files.
    updatetime:       Use the Last-modified header to set output file timestamps.
    writedescription: Write the video description to a .description file.
    writeinfojson:    Write the video description to a .info.json file.
    """
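
    # Illustrative sketch (not part of the original script) of the "mutual
    # registration" described above; option values and the URL are hypothetical.
    #
    #     fd = FileDownloader({'outtmpl': u'%(stitle)s-%(id)s.%(ext)s', 'quiet': False})
    #     youtube_ie = YoutubeIE()
    #     fd.add_info_extractor(youtube_ie)  # also sets youtube_ie._downloader = fd
    #     retcode = fd.download(['http://www.youtube.com/watch?v=BaW_jenozKc'])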

    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None
    _screen_file = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._ies = []
        self._pps = []
        self._download_retcode = 0
        self._num_downloads = 0
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self.params = params

    @staticmethod
    def format_bytes(bytes):
        if bytes is None:
            return 'N/A'
        if type(bytes) is str:
            bytes = float(bytes)
        if bytes == 0.0:
            exponent = 0
        else:
            exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffix)

    @staticmethod
    def calc_percent(byte_counter, data_len):
        if data_len is None:
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001: # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        dif = now - start
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return long(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return long(new_max)
        if rate < new_min:
            return long(new_min)
        return long(rate)

    @staticmethod
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        if matchobj is None:
            return None
        number = float(matchobj.group(1))
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

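    # Illustrative sketch (not part of the original script): parse_bytes() and
    # format_bytes() are rough inverses of each other, e.g.
    #
    #     FileDownloader.parse_bytes('50k')    # -> 51200L  (50 * 1024)
    #     FileDownloader.format_bytes(51200)   # -> '50.00k'
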
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        pp.set_downloader(self)

    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        try:
            if not self.params.get('quiet', False):
                terminator = [u'\n', u''][skip_eol]
                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                self._screen_file.flush()
        except (UnicodeEncodeError), err:
            if not ignore_encoding_errors:
                raise

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message.encode(preferredencoding())

    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            return
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))

    def fixed_template(self):
        """Checks if the output template is fixed."""
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            return
        now = time.time()
        elapsed = now - start_time
        if elapsed <= 0.0:
            return
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def temp_name(self, filename):
        """Returns a temporary filename for the given filename."""
        if self.params.get('nopart', False) or filename == u'-' or \
                (os.path.exists(filename) and not os.path.isfile(filename)):
            return filename
        return filename + u'.part'

    def undo_temp_name(self, filename):
        if filename.endswith(u'.part'):
            return filename[:-len(u'.part')]
        return filename

    def try_rename(self, old_filename, new_filename):
        try:
            if old_filename == new_filename:
                return
            os.rename(old_filename, new_filename)
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')

    def try_utime(self, filename, last_modified_hdr):
        """Try to set the last-modified time of the given file."""
        if last_modified_hdr is None:
            return
        if not os.path.isfile(filename):
            return
        timestr = last_modified_hdr
        if timestr is None:
            return
        filetime = timeconvert(timestr)
        if filetime is None:
            return
        try:
            os.utime(filename, (time.time(), filetime))
        except:
            pass

    def report_writedescription(self, descfn):
        """Report that the description file is being written"""
        self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)

    def report_writeinfojson(self, infofn):
        """Report that the metadata file is being written"""
        self.to_screen(u'[info] Writing video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            return
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            self.to_screen(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        else:
            self.to_screen(u'')

    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1

    def prepare_filename(self, info_dict):
        """Generate the output filename."""
        try:
            template_dict = dict(info_dict)
            template_dict['epoch'] = unicode(long(time.time()))
            template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
            filename = self.params['outtmpl'] % template_dict
            return filename
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            return None

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        filename = self.prepare_filename(info_dict)
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Forced printings
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcefilename', False) and filename is not None:
                print filename.encode(preferredencoding(), 'xmlcharrefreplace')

            return

        if filename is None:
            return
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')
            return

        try:
            dn = os.path.dirname(filename)
            if dn != '' and not os.path.exists(dn):
                os.makedirs(dn)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directory ' + unicode(err))
            return

        if self.params.get('writedescription', False):
            try:
                descfn = filename + '.description'
                self.report_writedescription(descfn)
                descfile = open(descfn, 'wb')
                try:
                    descfile.write(info_dict['description'].encode('utf-8'))
                finally:
                    descfile.close()
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write description file ' + descfn)
                return

        if self.params.get('writeinfojson', False):
            infofn = filename + '.info.json'
            self.report_writeinfojson(infofn)
            try:
                json.dump
            except (NameError, AttributeError):
                self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
                return
            try:
                infof = open(infofn, 'wb')
                try:
                    json.dump(info_dict, infof)
                finally:
                    infof.close()
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
                return

        try:
            success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
            return
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
            return

        if success:
            try:
                self.post_process(filename, info_dict)
            except (PostProcessingError), err:
                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                return

    def download(self, url_list):
        """Download a given list of URLs."""
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):
                    continue

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it
                ie.extract(url)

                # Suitable InfoExtractor has been found; go to next URL
                break

            if not suitable_found:
                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode
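
    # Illustrative sketch (not part of the original script): prepare_filename()
    # expands outtmpl with the info dictionary plus the automatically added
    # %(epoch)s and %(autonumber)s keys; the values below are hypothetical.
    #
    #     u'%(stitle)s-%(id)s.%(ext)s'  ->  u'Some_video-abc123.mp4'
    #     u'%(autonumber)s.%(ext)s'     ->  u'00001.mp4'
    #     u'downloaded.flv'             ->  no %(...)s fields, so fixed_template()
    #                                       is True and download() raises
    #                                       SameFileError for multiple URLs.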

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        info = dict(ie_info)
        info['filepath'] = filename
        for pp in self._pps:
            info = pp.run(info)
            if info is None:
                break

    def _download_with_rtmpdump(self, filename, url, player_url):
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrupted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            if prevsize == cursize and retval == 1:
                break
        if retval == 0:
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
            self.try_rename(tmpfilename, filename)
            return True
        else:
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            return False

    def _do_download(self, filename, url, player_url):
        # Check file already present
        if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
            self.report_file_already_downloaded(filename)
            return True

        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        tmpfilename = self.temp_name(filename)
        stream = None
        open_mode = 'wb'

        # Do not include the Accept-Encoding header
        headers = {'Youtubedl-no-compression': 'True'}
        basic_request = urllib2.Request(url, None, headers)
        request = urllib2.Request(url, None, headers)

        # Establish possible resume length
        if os.path.isfile(tmpfilename):
            resume_len = os.path.getsize(tmpfilename)
        else:
            resume_len = 0

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range', 'bytes=%d-' % resume_len)
            open_mode = 'ab'

        count = 0
        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
            try:
                data = urllib2.urlopen(request)
                break
            except (urllib2.HTTPError, ), err:
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                    raise
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    try:
                        # Open the connection again without the range header
                        data = urllib2.urlopen(basic_request)
                        content_length = data.info()['Content-Length']
                    except (urllib2.HTTPError, ), err:
                        if err.code < 500 or err.code >= 600:
                            raise
                    else:
                        # Examine the reported length
                        if (content_length is not None and
                                (resume_len - 100 < long(content_length) < resume_len + 100)):
                            # The file had already been fully downloaded.
                            # Explanation of the above condition: in issue #175 it was revealed that
                            # YouTube sometimes adds or removes a few bytes from the end of the file,
                            # changing the file size slightly and causing problems for some users. So
                            # I decided to implement a suggested change and consider the file
                            # completely downloaded if the file size differs by less than 100 bytes from
                            # the one on the hard drive.
                            self.report_file_already_downloaded(filename)
                            self.try_rename(tmpfilename, filename)
                            return True
                        else:
                            # The length does not match, so we start the download over
                            self.report_unable_to_resume()
                            open_mode = 'wb'
                            break
            # Retry
            count += 1
            if count <= retries:
                self.report_retry(count, retries)

        if count > retries:
            self.trouble(u'ERROR: giving up after %s retries' % retries)
            return False

        data_len = data.info().get('Content-length', None)
        if data_len is not None:
            data_len = long(data_len) + resume_len
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0 + resume_len
        block_size = 1024
        start = time.time()
        while True:
            # Download and write
            before = time.time()
            data_block = data.read(block_size)
            after = time.time()
            if len(data_block) == 0:
                break
            byte_counter += len(data_block)

            # Open file just in time
            if stream is None:
                try:
                    (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
                    assert stream is not None
                    filename = self.undo_temp_name(tmpfilename)
                    self.report_destination(filename)
                except (OSError, IOError), err:
                    self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
                    return False
            try:
                stream.write(data_block)
            except (IOError, OSError), err:
                self.trouble(u'\nERROR: unable to write data: %s' % str(err))
                return False
            block_size = self.best_block_size(after - before, len(data_block))

            # Progress message
            percent_str = self.calc_percent(byte_counter, data_len)
            eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
            speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
            self.report_progress(percent_str, data_len_str, speed_str, eta_str)

            # Apply rate limit
            self.slow_down(start, byte_counter - resume_len)

        if stream is None:
            self.trouble(u'\nERROR: Did not get any data blocks')
            return False
        stream.close()
        self.report_finish()
        if data_len is not None and byte_counter != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        self.try_rename(tmpfilename, filename)

        # Update file modification time
        if self.params.get('updatetime', True):
            self.try_utime(filename, data.info().get('last-modified', None))

        return True


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, the
    simplified title, the author and others. The information is stored in
    a dictionary which is then passed to the FileDownloader. The
    FileDownloader processes this information, possibly downloading the
    video to the file system, among other possible outcomes. The
    dictionaries must include the following fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

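    # Illustrative sketch (not part of the original script): the minimal shape
    # of a subclass, using a hypothetical ExampleIE and URL pattern.
    #
    #     class ExampleIE(InfoExtractor):
    #         _VALID_URL = r'(?:http://)?(?:www\.)?example\.com/video/(\d+)'
    #
    #         @staticmethod
    #         def suitable(url):
    #             return (re.match(ExampleIE._VALID_URL, url) is not None)
    #
    #         def _real_initialize(self):
    #             pass
    #
    #         def _real_extract(self, url):
    #             video_id = re.match(self._VALID_URL, url).group(1)
    #             self._downloader.increment_downloads()
    #             self._downloader.process_info({
    #                 'id': video_id, 'url': u'http://example.com/video.flv',
    #                 'uploader': u'NA', 'upload_date': u'NA', 'title': u'Example',
    #                 'stitle': u'Example', 'ext': u'flv', 'format': u'NA',
    #                 'player_url': None,
    #             })
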
    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Receives a URL and returns True if suitable for this IE."""
        return False

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass


class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '45': 'webm',
    }

    @staticmethod
    def suitable(url):
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested video format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _real_initialize(self):
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description
        try:
            lxml.etree
        except NameError:
            video_description = u'No description available.'
            if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
                mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
                if mobj is not None:
                    video_description = mobj.group(1).decode('utf-8')
        else:
            html_parser = lxml.etree.HTMLParser(encoding='utf-8')
            vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
            video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
            # TODO use another parser

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description,
                    'player_url': player_url,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')

class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    _youtube_ie = None

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
            return

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')


4135fa45
WB
1487class DailymotionIE(InfoExtractor):
1488 """Information Extractor for Dailymotion"""
1489
1490 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
4135fa45
WB
1491
1492 def __init__(self, downloader=None):
1493 InfoExtractor.__init__(self, downloader)
1494
1495 @staticmethod
1496 def suitable(url):
1497 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1498
4135fa45
WB
1499 def report_download_webpage(self, video_id):
1500 """Report webpage download."""
331ce0a0 1501 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
d3975459 1502
4135fa45
WB
1503 def report_extraction(self, video_id):
1504 """Report information extraction."""
331ce0a0 1505 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
4135fa45
WB
1506
1507 def _real_initialize(self):
1508 return
1509
4135fa45
WB
1510 def _real_extract(self, url):
1511 # Extract id and simplified title from URL
1512 mobj = re.match(self._VALID_URL, url)
1513 if mobj is None:
1514 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1515 return
1516
df372a65 1517 # At this point we have a new video
9bf7fa52 1518 self._downloader.increment_downloads()
4135fa45
WB
1519 video_id = mobj.group(1)
1520
1521 simple_title = mobj.group(2).decode('utf-8')
1522 video_extension = 'flv'
1523
1524 # Retrieve video webpage to extract further information
1525 request = urllib2.Request(url)
62a29bbf 1526 request.add_header('Cookie', 'family_filter=off')
4135fa45
WB
1527 try:
1528 self.report_download_webpage(video_id)
1529 webpage = urllib2.urlopen(request).read()
1530 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 1531 self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
1532 return
1533
1534 # Extract URL, uploader and title from webpage
1535 self.report_extraction(video_id)
62a29bbf 1536 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
4135fa45
WB
1537 if mobj is None:
1538 self._downloader.trouble(u'ERROR: unable to extract media URL')
1539 return
62a29bbf 1540 sequence = urllib.unquote(mobj.group(1))
1541 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1542 if mobj is None:
1543 self._downloader.trouble(u'ERROR: unable to extract media URL')
1544 return
1545 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
4135fa45
WB
1546
 1547 # If the extracted URL turns out to be relative, it would need to be resolved against http://www.dailymotion.com/ (see the commented-out sketch below)
1548
1549 video_url = mediaURL
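		# Sketch only (assumption, not part of the original script): resolving a
		# relative mediaURL against the site root could look like the commented
		# lines below, using the standard-library urlparse module. The code is left
		# inactive because Dailymotion currently appears to return absolute URLs.
		#
		# import urlparse
		# if not mediaURL.lower().startswith('http'):
		# 	video_url = urlparse.urljoin('http://www.dailymotion.com/', mediaURL)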
1550
62a29bbf 1551 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
4135fa45
WB
1552 if mobj is None:
1553 self._downloader.trouble(u'ERROR: unable to extract title')
1554 return
1555 video_title = mobj.group(1).decode('utf-8')
1556 video_title = sanitize_title(video_title)
1557
62a29bbf 1558 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
4135fa45
WB
1559 if mobj is None:
1560 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1561 return
1562 video_uploader = mobj.group(1)
1563
1564 try:
1565 # Process video information
1566 self._downloader.process_info({
1567 'id': video_id.decode('utf-8'),
1568 'url': video_url.decode('utf-8'),
1569 'uploader': video_uploader.decode('utf-8'),
138b11f3 1570 'upload_date': u'NA',
4135fa45
WB
1571 'title': video_title,
1572 'stitle': simple_title,
1573 'ext': video_extension.decode('utf-8'),
1574 'format': u'NA',
1575 'player_url': None,
1576 })
73f4e7af 1577 except UnavailableVideoError:
09cc744c 1578 self._downloader.trouble(u'\nERROR: unable to download video')
4135fa45 1579
c0a10ca8 1580
49c0028a 1581class GoogleIE(InfoExtractor):
1582 """Information extractor for video.google.com."""
1583
490fd7ae 1584 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
49c0028a 1585
1586 def __init__(self, downloader=None):
1587 InfoExtractor.__init__(self, downloader)
1588
1589 @staticmethod
1590 def suitable(url):
1591 return (re.match(GoogleIE._VALID_URL, url) is not None)
1592
1593 def report_download_webpage(self, video_id):
1594 """Report webpage download."""
331ce0a0 1595 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
49c0028a 1596
1597 def report_extraction(self, video_id):
1598 """Report information extraction."""
331ce0a0 1599 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
49c0028a 1600
1601 def _real_initialize(self):
1602 return
1603
1604 def _real_extract(self, url):
1605 # Extract id from URL
1606 mobj = re.match(self._VALID_URL, url)
1607 if mobj is None:
1608 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1609 return
1610
df372a65 1611 # At this point we have a new video
9bf7fa52 1612 self._downloader.increment_downloads()
49c0028a 1613 video_id = mobj.group(1)
1614
1615 video_extension = 'mp4'
1616
1617 # Retrieve video webpage to extract further information
490fd7ae 1618 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
49c0028a 1619 try:
1620 self.report_download_webpage(video_id)
1621 webpage = urllib2.urlopen(request).read()
1622 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1623 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1624 return
1625
1626 # Extract URL, uploader, and title from webpage
1627 self.report_extraction(video_id)
490fd7ae
RG
1628 mobj = re.search(r"download_url:'([^']+)'", webpage)
1629 if mobj is None:
1630 video_extension = 'flv'
1631 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
49c0028a 1632 if mobj is None:
1633 self._downloader.trouble(u'ERROR: unable to extract media URL')
1634 return
1635 mediaURL = urllib.unquote(mobj.group(1))
1636 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1637 mediaURL = mediaURL.replace('\\x26', '\x26')
1638
1639 video_url = mediaURL
1640
1641 mobj = re.search(r'<title>(.*)</title>', webpage)
1642 if mobj is None:
1643 self._downloader.trouble(u'ERROR: unable to extract title')
1644 return
1645 video_title = mobj.group(1).decode('utf-8')
490fd7ae 1646 video_title = sanitize_title(video_title)
31cbdaaf 1647 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
49c0028a 1648
7e58d568
RG
1649 # Extract video description
1650 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1651 if mobj is None:
1652 self._downloader.trouble(u'ERROR: unable to extract video description')
1653 return
1654 video_description = mobj.group(1).decode('utf-8')
1655 if not video_description:
1656 video_description = 'No description available.'
1657
1658 # Extract video thumbnail
1659 if self._downloader.params.get('forcethumbnail', False):
1660 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1661 try:
1662 webpage = urllib2.urlopen(request).read()
1663 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1664 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1665 return
1666 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1667 if mobj is None:
1668 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1669 return
1670 video_thumbnail = mobj.group(1)
1671 else: # we need something to pass to process_info
1672 video_thumbnail = ''
1673
49c0028a 1674 try:
1675 # Process video information
1676 self._downloader.process_info({
1677 'id': video_id.decode('utf-8'),
1678 'url': video_url.decode('utf-8'),
6ba562b0 1679 'uploader': u'NA',
138b11f3 1680 'upload_date': u'NA',
490fd7ae 1681 'title': video_title,
31cbdaaf 1682 'stitle': simple_title,
49c0028a 1683 'ext': video_extension.decode('utf-8'),
6ba562b0 1684 'format': u'NA',
e616ec0c 1685 'player_url': None,
49c0028a 1686 })
73f4e7af 1687 except UnavailableVideoError:
09cc744c 1688 self._downloader.trouble(u'\nERROR: unable to download video')
49c0028a 1689
1690
1691class PhotobucketIE(InfoExtractor):
1692 """Information extractor for photobucket.com."""
1693
1694 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1695
1696 def __init__(self, downloader=None):
1697 InfoExtractor.__init__(self, downloader)
1698
1699 @staticmethod
1700 def suitable(url):
1701 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1702
1703 def report_download_webpage(self, video_id):
1704 """Report webpage download."""
331ce0a0 1705 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
49c0028a 1706
1707 def report_extraction(self, video_id):
1708 """Report information extraction."""
331ce0a0 1709 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
49c0028a 1710
1711 def _real_initialize(self):
1712 return
1713
1714 def _real_extract(self, url):
1715 # Extract id from URL
1716 mobj = re.match(self._VALID_URL, url)
1717 if mobj is None:
1718 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1719 return
1720
df372a65 1721 # At this point we have a new video
9bf7fa52 1722 self._downloader.increment_downloads()
49c0028a 1723 video_id = mobj.group(1)
1724
1725 video_extension = 'flv'
1726
1727 # Retrieve video webpage to extract further information
1728 request = urllib2.Request(url)
1729 try:
1730 self.report_download_webpage(video_id)
1731 webpage = urllib2.urlopen(request).read()
1732 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1733 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1734 return
1735
1736 # Extract URL, uploader, and title from webpage
1737 self.report_extraction(video_id)
1738 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1739 if mobj is None:
1740 self._downloader.trouble(u'ERROR: unable to extract media URL')
1741 return
1742 mediaURL = urllib.unquote(mobj.group(1))
1743
1744 video_url = mediaURL
1745
1746 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1747 if mobj is None:
1748 self._downloader.trouble(u'ERROR: unable to extract title')
1749 return
1750 video_title = mobj.group(1).decode('utf-8')
490fd7ae 1751 video_title = sanitize_title(video_title)
31cbdaaf 1752 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
49c0028a 1753
1754 video_uploader = mobj.group(2).decode('utf-8')
1755
1756 try:
1757 # Process video information
1758 self._downloader.process_info({
1759 'id': video_id.decode('utf-8'),
1760 'url': video_url.decode('utf-8'),
490fd7ae 1761 'uploader': video_uploader,
138b11f3 1762 'upload_date': u'NA',
490fd7ae 1763 'title': video_title,
31cbdaaf 1764 'stitle': simple_title,
490fd7ae 1765 'ext': video_extension.decode('utf-8'),
6ba562b0 1766 'format': u'NA',
e616ec0c 1767 'player_url': None,
490fd7ae 1768 })
73f4e7af 1769 except UnavailableVideoError:
09cc744c 1770 self._downloader.trouble(u'\nERROR: unable to download video')
490fd7ae
RG
1771
1772
61945318
RG
1773class YahooIE(InfoExtractor):
1774 """Information extractor for video.yahoo.com."""
1775
1776 # _VALID_URL matches all Yahoo! Video URLs
1777 # _VPAGE_URL matches only the extractable '/watch/' URLs
1778 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1779 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1780
1781 def __init__(self, downloader=None):
1782 InfoExtractor.__init__(self, downloader)
1783
1784 @staticmethod
1785 def suitable(url):
1786 return (re.match(YahooIE._VALID_URL, url) is not None)
1787
1788 def report_download_webpage(self, video_id):
1789 """Report webpage download."""
331ce0a0 1790 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
61945318
RG
1791
1792 def report_extraction(self, video_id):
1793 """Report information extraction."""
331ce0a0 1794 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
61945318
RG
1795
1796 def _real_initialize(self):
1797 return
1798
df372a65 1799 def _real_extract(self, url, new_video=True):
61945318
RG
1800 # Extract ID from URL
1801 mobj = re.match(self._VALID_URL, url)
1802 if mobj is None:
1803 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1804 return
1805
df372a65 1806 # At this point we have a new video
9bf7fa52 1807 self._downloader.increment_downloads()
61945318
RG
1808 video_id = mobj.group(2)
1809 video_extension = 'flv'
1810
1811 # Rewrite valid but non-extractable URLs as
1812 # extractable English language /watch/ URLs
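		# (Illustration with made-up ids: a non-/watch/ link such as
		# http://video.yahoo.com/network/100284668?v=4446882 is fetched once, the
		# "vid" and "id" values are read out of that page, and extraction restarts
		# on the rebuilt http://video.yahoo.com/watch/<vid>/<id> URL.)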
1813 if re.match(self._VPAGE_URL, url) is None:
1814 request = urllib2.Request(url)
1815 try:
1816 webpage = urllib2.urlopen(request).read()
1817 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1818 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1819 return
1820
1821 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1822 if mobj is None:
1823 self._downloader.trouble(u'ERROR: Unable to extract id field')
1824 return
1825 yahoo_id = mobj.group(1)
1826
1827 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1828 if mobj is None:
1829 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1830 return
1831 yahoo_vid = mobj.group(1)
1832
1833 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
df372a65 1834 return self._real_extract(url, new_video=False)
61945318
RG
1835
1836 # Retrieve video webpage to extract further information
1837 request = urllib2.Request(url)
1838 try:
1839 self.report_download_webpage(video_id)
1840 webpage = urllib2.urlopen(request).read()
1841 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1842 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1843 return
1844
1845 # Extract uploader and title from webpage
1846 self.report_extraction(video_id)
1847 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1848 if mobj is None:
1849 self._downloader.trouble(u'ERROR: unable to extract video title')
1850 return
1851 video_title = mobj.group(1).decode('utf-8')
1852 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1853
1854 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1855 if mobj is None:
1856 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1857 return
 1858 video_uploader = mobj.group(2).decode('utf-8')
1859
7e58d568
RG
1860 # Extract video thumbnail
1861 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1862 if mobj is None:
1863 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1864 return
1865 video_thumbnail = mobj.group(1).decode('utf-8')
1866
1867 # Extract video description
1868 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1869 if mobj is None:
1870 self._downloader.trouble(u'ERROR: unable to extract video description')
1871 return
1872 video_description = mobj.group(1).decode('utf-8')
c0a10ca8
F
1873 if not video_description:
1874 video_description = 'No description available.'
7e58d568 1875
61945318
RG
1876 # Extract video height and width
1877 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1878 if mobj is None:
1879 self._downloader.trouble(u'ERROR: unable to extract video height')
1880 return
1881 yv_video_height = mobj.group(1)
1882
1883 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1884 if mobj is None:
1885 self._downloader.trouble(u'ERROR: unable to extract video width')
1886 return
1887 yv_video_width = mobj.group(1)
1888
1889 # Retrieve video playlist to extract media URL
1890 # I'm not completely sure what all these options are, but we
1891 # seem to need most of them, otherwise the server sends a 401.
1892 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1893 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1894 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
c0a10ca8
F
1895 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1896 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
61945318
RG
1897 try:
1898 self.report_download_webpage(video_id)
1899 webpage = urllib2.urlopen(request).read()
1900 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1901 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1902 return
1903
1904 # Extract media URL from playlist XML
1905 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1906 if mobj is None:
1907 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1908 return
1909 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1910 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1911
1912 try:
1913 # Process video information
1914 self._downloader.process_info({
1915 'id': video_id.decode('utf-8'),
1916 'url': video_url,
1917 'uploader': video_uploader,
138b11f3 1918 'upload_date': u'NA',
61945318
RG
1919 'title': video_title,
1920 'stitle': simple_title,
1921 'ext': video_extension.decode('utf-8'),
7e58d568
RG
1922 'thumbnail': video_thumbnail.decode('utf-8'),
1923 'description': video_description,
e616ec0c 1925 'player_url': None,
61945318 1926 })
73f4e7af 1927 except UnavailableVideoError:
09cc744c 1928 self._downloader.trouble(u'\nERROR: unable to download video')
61945318
RG
1929
1930
92743d42
RB
1931class VimeoIE(InfoExtractor):
1932 """Information extractor for vimeo.com."""
1933
1934 # _VALID_URL matches Vimeo URLs
44c636df 1935 _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
92743d42
RB
1936
1937 def __init__(self, downloader=None):
1938 InfoExtractor.__init__(self, downloader)
1939
1940 @staticmethod
1941 def suitable(url):
1942 return (re.match(VimeoIE._VALID_URL, url) is not None)
1943
1944 def report_download_webpage(self, video_id):
1945 """Report webpage download."""
0ecedbdb 1946 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
92743d42
RB
1947
1948 def report_extraction(self, video_id):
1949 """Report information extraction."""
0ecedbdb 1950 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
92743d42
RB
1951
1952 def _real_initialize(self):
1953 return
1954
1955 def _real_extract(self, url, new_video=True):
1956 # Extract ID from URL
1957 mobj = re.match(self._VALID_URL, url)
1958 if mobj is None:
1959 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1960 return
1961
1962 # At this point we have a new video
1963 self._downloader.increment_downloads()
1964 video_id = mobj.group(1)
92743d42
RB
1965
1966 # Retrieve video webpage to extract further information
1967 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1968 try:
1969 self.report_download_webpage(video_id)
1970 webpage = urllib2.urlopen(request).read()
1971 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1972 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1973 return
1974
f24c674b
RB
1975 # Now we begin extracting as much information as we can from what we
1976 # retrieved. First we extract the information common to all extractors,
 1977 # and later we extract those that are Vimeo-specific.
92743d42 1978 self.report_extraction(video_id)
f24c674b
RB
1979
1980 # Extract title
c5a088d3 1981 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
92743d42
RB
1982 if mobj is None:
1983 self._downloader.trouble(u'ERROR: unable to extract video title')
1984 return
1985 video_title = mobj.group(1).decode('utf-8')
1986 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1987
f24c674b 1988 # Extract uploader
c5a088d3 1989 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
92743d42
RB
1990 if mobj is None:
1991 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1992 return
1993 video_uploader = mobj.group(1).decode('utf-8')
1994
1995 # Extract video thumbnail
c5a088d3 1996 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
92743d42
RB
1997 if mobj is None:
1998 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1999 return
2000 video_thumbnail = mobj.group(1).decode('utf-8')
2001
2002 # # Extract video description
2003 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2004 # if mobj is None:
2005 # self._downloader.trouble(u'ERROR: unable to extract video description')
2006 # return
2007 # video_description = mobj.group(1).decode('utf-8')
2008 # if not video_description: video_description = 'No description available.'
 2009 video_description = 'No description available.'
2010
f24c674b 2011 # Vimeo specific: extract request signature
c5a088d3 2012 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
92743d42
RB
2013 if mobj is None:
2014 self._downloader.trouble(u'ERROR: unable to extract request signature')
2015 return
2016 sig = mobj.group(1).decode('utf-8')
2017
f24c674b 2018 # Vimeo specific: Extract request signature expiration
c5a088d3 2019 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
92743d42
RB
2020 if mobj is None:
2021 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2022 return
2023 sig_exp = mobj.group(1).decode('utf-8')
2024
2025 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2026
2027 try:
2028 # Process video information
2029 self._downloader.process_info({
2030 'id': video_id.decode('utf-8'),
2031 'url': video_url,
2032 'uploader': video_uploader,
2033 'upload_date': u'NA',
2034 'title': video_title,
2035 'stitle': simple_title,
2fc31a48 2036 'ext': u'mp4',
92743d42
RB
2037 'thumbnail': video_thumbnail.decode('utf-8'),
2038 'description': video_description,
2041 'player_url': None,
2042 })
2043 except UnavailableVideoError:
 2044 self._downloader.trouble(u'\nERROR: unable to download video')
2045
2046
490fd7ae
RG
2047class GenericIE(InfoExtractor):
2048 """Generic last-resort information extractor."""
2049
2050 def __init__(self, downloader=None):
2051 InfoExtractor.__init__(self, downloader)
2052
2053 @staticmethod
2054 def suitable(url):
2055 return True
2056
2057 def report_download_webpage(self, video_id):
2058 """Report webpage download."""
331ce0a0
RG
2059 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2060 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
490fd7ae
RG
2061
2062 def report_extraction(self, video_id):
2063 """Report information extraction."""
331ce0a0 2064 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
490fd7ae
RG
2065
2066 def _real_initialize(self):
2067 return
2068
2069 def _real_extract(self, url):
df372a65 2070 # At this point we have a new video
9bf7fa52 2071 self._downloader.increment_downloads()
df372a65 2072
490fd7ae
RG
2073 video_id = url.split('/')[-1]
2074 request = urllib2.Request(url)
2075 try:
2076 self.report_download_webpage(video_id)
2077 webpage = urllib2.urlopen(request).read()
2078 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2079 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2080 return
2081 except ValueError, err:
2082 # since this is the last-resort InfoExtractor, if
2083 # this error is thrown, it'll be thrown here
2084 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2085 return
2086
a9806fd8 2087 self.report_extraction(video_id)
490fd7ae
RG
2088 # Start with something easy: JW Player in SWFObject
2089 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2090 if mobj is None:
2091 # Broaden the search a little bit
2092 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2093 if mobj is None:
2094 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2095 return
2096
2097 # It's possible that one of the regexes
2098 # matched, but returned an empty group:
2099 if mobj.group(1) is None:
2100 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2101 return
2102
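		# Illustration (fabricated markup, not taken from any particular site) of the
		# embeds the two patterns above are meant to catch:
		#   flashvars: 'autostart=true&file=http://example.com/media/clip.flv'
		#   <embed src="player.swf?file=http://example.com/media/clip.flv">
		# In both cases group(1) is the direct http media URL.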
2103 video_url = urllib.unquote(mobj.group(1))
c0a10ca8 2104 video_id = os.path.basename(video_url)
490fd7ae
RG
2105
2106 # here's a fun little line of code for you:
2107 video_extension = os.path.splitext(video_id)[1][1:]
c0a10ca8 2108 video_id = os.path.splitext(video_id)[0]
490fd7ae
RG
2109
2110 # it's tempting to parse this further, but you would
2111 # have to take into account all the variations like
2112 # Video Title - Site Name
2113 # Site Name | Video Title
2114 # Video Title - Tagline | Site Name
2115 # and so on and so forth; it's just not practical
2116 mobj = re.search(r'<title>(.*)</title>', webpage)
2117 if mobj is None:
2118 self._downloader.trouble(u'ERROR: unable to extract title')
2119 return
2120 video_title = mobj.group(1).decode('utf-8')
2121 video_title = sanitize_title(video_title)
31cbdaaf 2122 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
490fd7ae
RG
2123
2124 # video uploader is domain name
2125 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2126 if mobj is None:
 2127 self._downloader.trouble(u'ERROR: unable to extract uploader (domain name)')
2128 return
2129 video_uploader = mobj.group(1).decode('utf-8')
2130
2131 try:
2132 # Process video information
2133 self._downloader.process_info({
2134 'id': video_id.decode('utf-8'),
2135 'url': video_url.decode('utf-8'),
2136 'uploader': video_uploader,
138b11f3 2137 'upload_date': u'NA',
490fd7ae 2138 'title': video_title,
31cbdaaf 2139 'stitle': simple_title,
49c0028a 2140 'ext': video_extension.decode('utf-8'),
6ba562b0 2141 'format': u'NA',
e616ec0c 2142 'player_url': None,
49c0028a 2143 })
73f4e7af 2144 except UnavailableVideoError, err:
09cc744c 2145 self._downloader.trouble(u'\nERROR: unable to download video')
49c0028a 2146
2147
25af2bce
RG
2148class YoutubeSearchIE(InfoExtractor):
2149 """Information Extractor for YouTube search queries."""
2150 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2151 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2152 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
304a4d85 2153 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
25af2bce 2154 _youtube_ie = None
fd9288c3 2155 _max_youtube_results = 1000
25af2bce 2156
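	# Accepted query syntax (handled in _real_extract below): "ytsearch:PHRASE"
	# downloads the first result, "ytsearchN:PHRASE" the first N results (capped at
	# _max_youtube_results), and "ytsearchall:PHRASE" up to that same cap.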
f995f712 2157 def __init__(self, youtube_ie, downloader=None):
25af2bce
RG
2158 InfoExtractor.__init__(self, downloader)
2159 self._youtube_ie = youtube_ie
d3975459 2160
25af2bce
RG
2161 @staticmethod
2162 def suitable(url):
2163 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2164
2165 def report_download_page(self, query, pagenum):
2166 """Report attempt to download playlist page with given number."""
490fd7ae 2167 query = query.decode(preferredencoding())
331ce0a0 2168 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
25af2bce
RG
2169
2170 def _real_initialize(self):
2171 self._youtube_ie.initialize()
d3975459 2172
25af2bce
RG
2173 def _real_extract(self, query):
2174 mobj = re.match(self._VALID_QUERY, query)
2175 if mobj is None:
147753eb 2176 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
6f21f686 2177 return
25af2bce
RG
2178
2179 prefix, query = query.split(':')
2180 prefix = prefix[8:]
c0a10ca8 2181 query = query.encode('utf-8')
f995f712 2182 if prefix == '':
6f21f686
RG
2183 self._download_n_results(query, 1)
2184 return
f995f712 2185 elif prefix == 'all':
6f21f686
RG
2186 self._download_n_results(query, self._max_youtube_results)
2187 return
f995f712 2188 else:
25af2bce 2189 try:
e1f18b8a 2190 n = long(prefix)
25af2bce 2191 if n <= 0:
147753eb 2192 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
6f21f686 2193 return
257453b9 2194 elif n > self._max_youtube_results:
c0a10ca8 2195 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
257453b9 2196 n = self._max_youtube_results
6f21f686
RG
2197 self._download_n_results(query, n)
2198 return
e1f18b8a 2199 except ValueError: # parsing prefix as integer fails
6f21f686
RG
2200 self._download_n_results(query, 1)
2201 return
25af2bce
RG
2202
2203 def _download_n_results(self, query, n):
2204 """Downloads a specified number of results for a query"""
2205
2206 video_ids = []
2207 already_seen = set()
2208 pagenum = 1
2209
2210 while True:
2211 self.report_download_page(query, pagenum)
a9633f14 2212 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1987c232 2213 request = urllib2.Request(result_url)
25af2bce
RG
2214 try:
2215 page = urllib2.urlopen(request).read()
2216 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 2217 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
6f21f686 2218 return
25af2bce
RG
2219
2220 # Extract video identifiers
2221 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2222 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2223 if video_id not in already_seen:
2224 video_ids.append(video_id)
2225 already_seen.add(video_id)
2226 if len(video_ids) == n:
2227 # Specified n videos reached
25af2bce 2228 for id in video_ids:
6f21f686
RG
2229 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2230 return
25af2bce 2231
304a4d85 2232 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
25af2bce 2233 for id in video_ids:
6f21f686
RG
2234 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2235 return
25af2bce
RG
2236
2237 pagenum = pagenum + 1
2238
c0a10ca8 2239
7e58d568
RG
2240class GoogleSearchIE(InfoExtractor):
2241 """Information Extractor for Google Video search queries."""
2242 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2243 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2244 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2245 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2246 _google_ie = None
2247 _max_google_results = 1000
2248
2249 def __init__(self, google_ie, downloader=None):
2250 InfoExtractor.__init__(self, downloader)
2251 self._google_ie = google_ie
d3975459 2252
7e58d568
RG
2253 @staticmethod
2254 def suitable(url):
2255 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2256
2257 def report_download_page(self, query, pagenum):
2258 """Report attempt to download playlist page with given number."""
2259 query = query.decode(preferredencoding())
331ce0a0 2260 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
7e58d568
RG
2261
2262 def _real_initialize(self):
2263 self._google_ie.initialize()
d3975459 2264
7e58d568
RG
2265 def _real_extract(self, query):
2266 mobj = re.match(self._VALID_QUERY, query)
2267 if mobj is None:
2268 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2269 return
2270
2271 prefix, query = query.split(':')
2272 prefix = prefix[8:]
c0a10ca8 2273 query = query.encode('utf-8')
7e58d568
RG
2274 if prefix == '':
2275 self._download_n_results(query, 1)
2276 return
2277 elif prefix == 'all':
2278 self._download_n_results(query, self._max_google_results)
2279 return
2280 else:
2281 try:
2282 n = long(prefix)
2283 if n <= 0:
2284 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2285 return
2286 elif n > self._max_google_results:
c0a10ca8 2287 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
7e58d568
RG
2288 n = self._max_google_results
2289 self._download_n_results(query, n)
2290 return
2291 except ValueError: # parsing prefix as integer fails
2292 self._download_n_results(query, 1)
2293 return
2294
2295 def _download_n_results(self, query, n):
2296 """Downloads a specified number of results for a query"""
2297
2298 video_ids = []
2299 already_seen = set()
2300 pagenum = 1
2301
2302 while True:
2303 self.report_download_page(query, pagenum)
2304 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1987c232 2305 request = urllib2.Request(result_url)
7e58d568
RG
2306 try:
2307 page = urllib2.urlopen(request).read()
2308 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2309 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2310 return
2311
2312 # Extract video identifiers
2313 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2314 video_id = mobj.group(1)
2315 if video_id not in already_seen:
2316 video_ids.append(video_id)
2317 already_seen.add(video_id)
2318 if len(video_ids) == n:
2319 # Specified n videos reached
2320 for id in video_ids:
2321 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2322 return
2323
2324 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2325 for id in video_ids:
2326 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2327 return
2328
2329 pagenum = pagenum + 1
2330
c0a10ca8 2331
7e58d568
RG
2332class YahooSearchIE(InfoExtractor):
2333 """Information Extractor for Yahoo! Video search queries."""
2334 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2335 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2336 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2337 _MORE_PAGES_INDICATOR = r'\s*Next'
2338 _yahoo_ie = None
2339 _max_yahoo_results = 1000
2340
2341 def __init__(self, yahoo_ie, downloader=None):
2342 InfoExtractor.__init__(self, downloader)
2343 self._yahoo_ie = yahoo_ie
d3975459 2344
7e58d568
RG
2345 @staticmethod
2346 def suitable(url):
2347 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2348
2349 def report_download_page(self, query, pagenum):
2350 """Report attempt to download playlist page with given number."""
2351 query = query.decode(preferredencoding())
331ce0a0 2352 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
7e58d568
RG
2353
2354 def _real_initialize(self):
2355 self._yahoo_ie.initialize()
d3975459 2356
7e58d568
RG
2357 def _real_extract(self, query):
2358 mobj = re.match(self._VALID_QUERY, query)
2359 if mobj is None:
2360 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2361 return
2362
2363 prefix, query = query.split(':')
2364 prefix = prefix[8:]
c0a10ca8 2365 query = query.encode('utf-8')
7e58d568
RG
2366 if prefix == '':
2367 self._download_n_results(query, 1)
2368 return
2369 elif prefix == 'all':
2370 self._download_n_results(query, self._max_yahoo_results)
2371 return
2372 else:
2373 try:
2374 n = long(prefix)
2375 if n <= 0:
2376 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2377 return
2378 elif n > self._max_yahoo_results:
c0a10ca8 2379 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
7e58d568
RG
2380 n = self._max_yahoo_results
2381 self._download_n_results(query, n)
2382 return
2383 except ValueError: # parsing prefix as integer fails
2384 self._download_n_results(query, 1)
2385 return
2386
2387 def _download_n_results(self, query, n):
2388 """Downloads a specified number of results for a query"""
2389
2390 video_ids = []
2391 already_seen = set()
2392 pagenum = 1
2393
2394 while True:
2395 self.report_download_page(query, pagenum)
2396 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1987c232 2397 request = urllib2.Request(result_url)
7e58d568
RG
2398 try:
2399 page = urllib2.urlopen(request).read()
2400 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2401 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2402 return
2403
2404 # Extract video identifiers
2405 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2406 video_id = mobj.group(1)
2407 if video_id not in already_seen:
2408 video_ids.append(video_id)
2409 already_seen.add(video_id)
2410 if len(video_ids) == n:
2411 # Specified n videos reached
2412 for id in video_ids:
2413 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2414 return
2415
2416 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2417 for id in video_ids:
2418 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2419 return
2420
2421 pagenum = pagenum + 1
2422
c0a10ca8 2423
0c2dc87d
RG
2424class YoutubePlaylistIE(InfoExtractor):
2425 """Information Extractor for YouTube playlists."""
2426
2152ee86 2427 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
f74e22ae 2428 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
0c2dc87d 2429 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
ce5cafea 2430 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
0c2dc87d
RG
2431 _youtube_ie = None
2432
2433 def __init__(self, youtube_ie, downloader=None):
2434 InfoExtractor.__init__(self, downloader)
2435 self._youtube_ie = youtube_ie
d3975459 2436
0c2dc87d
RG
2437 @staticmethod
2438 def suitable(url):
2439 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2440
2441 def report_download_page(self, playlist_id, pagenum):
2442 """Report attempt to download playlist page with given number."""
331ce0a0 2443 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
0c2dc87d
RG
2444
2445 def _real_initialize(self):
2446 self._youtube_ie.initialize()
d3975459 2447
0c2dc87d
RG
2448 def _real_extract(self, url):
2449 # Extract playlist id
2450 mobj = re.match(self._VALID_URL, url)
2451 if mobj is None:
147753eb 2452 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
6f21f686 2453 return
0c2dc87d 2454
d119b54d
RG
2455 # Single video case
2456 if mobj.group(3) is not None:
2457 self._youtube_ie.extract(mobj.group(3))
2458 return
2459
0c2dc87d 2460 # Download playlist pages
f74e22ae
GI
 2461 # The prefix defaults to 'p' for ordinary playlists, but other list types (such as artist pages) need different URL parameters
2462 playlist_prefix = mobj.group(1)
2463 if playlist_prefix == 'a':
2464 playlist_access = 'artist'
2465 else:
7cc3c6fd 2466 playlist_prefix = 'p'
f74e22ae
GI
2467 playlist_access = 'view_play_list'
2468 playlist_id = mobj.group(2)
0c2dc87d
RG
2469 video_ids = []
2470 pagenum = 1
2471
2472 while True:
2473 self.report_download_page(playlist_id, pagenum)
f74e22ae 2474 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
0c2dc87d
RG
2475 try:
2476 page = urllib2.urlopen(request).read()
2477 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 2478 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
6f21f686 2479 return
0c2dc87d
RG
2480
2481 # Extract video identifiers
27d98b6e 2482 ids_in_page = []
0c2dc87d 2483 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
27d98b6e
RG
2484 if mobj.group(1) not in ids_in_page:
2485 ids_in_page.append(mobj.group(1))
2486 video_ids.extend(ids_in_page)
0c2dc87d 2487
ce5cafea 2488 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
0c2dc87d
RG
2489 break
2490 pagenum = pagenum + 1
2491
8cc44341
RG
2492 playliststart = self._downloader.params.get('playliststart', 1) - 1
2493 playlistend = self._downloader.params.get('playlistend', -1)
 2494 if playlistend == -1:
 	video_ids = video_ids[playliststart:]
 else:
 	video_ids = video_ids[playliststart:playlistend]
2495
0c2dc87d 2496 for id in video_ids:
6f21f686
RG
2497 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2498 return
0c2dc87d 2499
c0a10ca8 2500
c39c05cd
A
2501class YoutubeUserIE(InfoExtractor):
2502 """Information Extractor for YouTube users."""
2503
5aba6ea4 2504 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
c39c05cd 2505 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
5aba6ea4
RG
2506 _GDATA_PAGE_SIZE = 50
2507 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2508 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
c39c05cd
A
2509 _youtube_ie = None
2510
2511 def __init__(self, youtube_ie, downloader=None):
2512 InfoExtractor.__init__(self, downloader)
2513 self._youtube_ie = youtube_ie
d3975459 2514
c39c05cd
A
2515 @staticmethod
2516 def suitable(url):
2517 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2518
5aba6ea4 2519 def report_download_page(self, username, start_index):
c39c05cd 2520 """Report attempt to download user page."""
5aba6ea4 2521 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
c0a10ca8 2522 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
c39c05cd
A
2523
2524 def _real_initialize(self):
2525 self._youtube_ie.initialize()
d3975459 2526
c39c05cd
A
2527 def _real_extract(self, url):
2528 # Extract username
2529 mobj = re.match(self._VALID_URL, url)
2530 if mobj is None:
2531 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2532 return
2533
c39c05cd 2534 username = mobj.group(1)
5aba6ea4
RG
2535
2536 # Download video ids using YouTube Data API. Result size per
2537 # query is limited (currently to 50 videos) so we need to query
2538 # page by page until there are no video ids - it means we got
2539 # all of them.
2540
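		# Worked example of the paging below (with the default _GDATA_PAGE_SIZE of 50):
		# pagenum 0 requests start-index 1, pagenum 1 requests start-index 51, pagenum 2
		# requests start-index 101, and so on until a page yields fewer than 50 ids.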
c39c05cd 2541 video_ids = []
5aba6ea4 2542 pagenum = 0
c39c05cd 2543
5aba6ea4
RG
2544 while True:
2545 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2546 self.report_download_page(username, start_index)
c39c05cd 2547
5aba6ea4 2548 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
c39c05cd 2549
5aba6ea4
RG
2550 try:
2551 page = urllib2.urlopen(request).read()
2552 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2553 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2554 return
c39c05cd 2555
5aba6ea4
RG
2556 # Extract video identifiers
2557 ids_in_page = []
2558
2559 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2560 if mobj.group(1) not in ids_in_page:
2561 ids_in_page.append(mobj.group(1))
2562
2563 video_ids.extend(ids_in_page)
2564
 2565 # A small optimization: if the current page is not "full",
 2566 # i.e. it contains fewer than _GDATA_PAGE_SIZE video ids, we can
 2567 # assume it is the last one - there are no more ids on further
 2568 # pages, so there is no need to query again.
2570
2571 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2572 break
2573
2574 pagenum += 1
2575
2576 all_ids_count = len(video_ids)
8cc44341
RG
2577 playliststart = self._downloader.params.get('playliststart', 1) - 1
2578 playlistend = self._downloader.params.get('playlistend', -1)
204c9398 2579
5aba6ea4
RG
2580 if playlistend == -1:
2581 video_ids = video_ids[playliststart:]
2582 else:
2583 video_ids = video_ids[playliststart:playlistend]
7a9054ec 2584
5aba6ea4 2585 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
c0a10ca8 2586 (username, all_ids_count, len(video_ids)))
5aba6ea4
RG
2587
2588 for video_id in video_ids:
2589 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2590
c39c05cd 2591
27179cfd
VV
2592class DepositFilesIE(InfoExtractor):
2593 """Information extractor for depositfiles.com"""
2594
2595 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2596
2597 def __init__(self, downloader=None):
2598 InfoExtractor.__init__(self, downloader)
2599
2600 @staticmethod
2601 def suitable(url):
2602 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2603
2604 def report_download_webpage(self, file_id):
2605 """Report webpage download."""
2606 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2607
2608 def report_extraction(self, file_id):
2609 """Report information extraction."""
2610 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2611
2612 def _real_initialize(self):
2613 return
2614
2615 def _real_extract(self, url):
2616 # At this point we have a new file
2617 self._downloader.increment_downloads()
2618
2619 file_id = url.split('/')[-1]
2620 # Rebuild url in english locale
2621 url = 'http://depositfiles.com/en/files/' + file_id
2622
2623 # Retrieve file webpage with 'Free download' button pressed
2624 free_download_indication = { 'gateway_result' : '1' }
1987c232 2625 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
27179cfd
VV
2626 try:
2627 self.report_download_webpage(file_id)
2628 webpage = urllib2.urlopen(request).read()
2629 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2630 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2631 return
2632
2633 # Search for the real file URL
2634 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2635 if (mobj is None) or (mobj.group(1) is None):
2636 # Try to figure out reason of the error.
2637 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2638 if (mobj is not None) and (mobj.group(1) is not None):
 2639 restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
2640 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2641 else:
2642 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2643 return
2644
2645 file_url = mobj.group(1)
2646 file_extension = os.path.splitext(file_url)[1][1:]
2647
2648 # Search for file title
2649 mobj = re.search(r'<b title="(.*?)">', webpage)
2650 if mobj is None:
2651 self._downloader.trouble(u'ERROR: unable to extract title')
2652 return
2653 file_title = mobj.group(1).decode('utf-8')
2654
2655 try:
2656 # Process file information
2657 self._downloader.process_info({
2658 'id': file_id.decode('utf-8'),
2659 'url': file_url.decode('utf-8'),
2660 'uploader': u'NA',
2661 'upload_date': u'NA',
2662 'title': file_title,
2663 'stitle': file_title,
2664 'ext': file_extension.decode('utf-8'),
2665 'format': u'NA',
2666 'player_url': None,
2667 })
2668 except UnavailableVideoError, err:
2669 self._downloader.trouble(u'ERROR: unable to download file')
2670
c0a10ca8 2671
9f5f9602
GI
2672class FacebookIE(InfoExtractor):
2673 """Information Extractor for Facebook"""
2674
2675 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2676 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2677 _NETRC_MACHINE = 'facebook'
2678 _available_formats = ['highqual', 'lowqual']
2679 _video_extensions = {
2680 'highqual': 'mp4',
2681 'lowqual': 'mp4',
2682 }
2683
2684 def __init__(self, downloader=None):
2685 InfoExtractor.__init__(self, downloader)
2686
2687 @staticmethod
2688 def suitable(url):
2689 return (re.match(FacebookIE._VALID_URL, url) is not None)
2690
2691 def _reporter(self, message):
2692 """Add header and report message."""
2693 self._downloader.to_screen(u'[facebook] %s' % message)
2694
2695 def report_login(self):
2696 """Report attempt to log in."""
2697 self._reporter(u'Logging in')
2698
2699 def report_video_webpage_download(self, video_id):
2700 """Report attempt to download video webpage."""
2701 self._reporter(u'%s: Downloading video webpage' % video_id)
2702
2703 def report_information_extraction(self, video_id):
2704 """Report attempt to extract video information."""
2705 self._reporter(u'%s: Extracting video information' % video_id)
2706
2707 def _parse_page(self, video_webpage):
2708 """Extract video information from page"""
2709 # General data
2710 data = {'title': r'class="video_title datawrap">(.*?)</',
2711 'description': r'<div class="datawrap">(.*?)</div>',
2712 'owner': r'\("video_owner_name", "(.*?)"\)',
2713 'upload_date': r'data-date="(.*?)"',
2714 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2715 }
2716 video_info = {}
2717 for piece in data.keys():
2718 mobj = re.search(data[piece], video_webpage)
2719 if mobj is not None:
2720 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2721
2722 # Video urls
2723 video_urls = {}
2724 for fmt in self._available_formats:
2725 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2726 if mobj is not None:
2727 # URL is in a Javascript segment inside an escaped Unicode format within
2728 # the generally utf-8 page
2729 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2730 video_info['video_urls'] = video_urls
2731
2732 return video_info
2733
2734 def _real_initialize(self):
2735 if self._downloader is None:
2736 return
2737
2738 useremail = None
2739 password = None
2740 downloader_params = self._downloader.params
2741
2742 # Attempt to use provided username and password or .netrc data
2743 if downloader_params.get('username', None) is not None:
2744 useremail = downloader_params['username']
2745 password = downloader_params['password']
2746 elif downloader_params.get('usenetrc', False):
2747 try:
2748 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2749 if info is not None:
2750 useremail = info[0]
2751 password = info[2]
2752 else:
2753 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2754 except (IOError, netrc.NetrcParseError), err:
2755 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2756 return
2757
2758 if useremail is None:
2759 return
2760
2761 # Log in
2762 login_form = {
2763 'email': useremail,
2764 'pass': password,
2765 'login': 'Log+In'
2766 }
2767 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2768 try:
2769 self.report_login()
2770 login_results = urllib2.urlopen(request).read()
2771 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
 2772 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
2773 return
2774 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2775 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2776 return
2777
2778 def _real_extract(self, url):
2779 mobj = re.match(self._VALID_URL, url)
2780 if mobj is None:
2781 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2782 return
2783 video_id = mobj.group('ID')
2784
2785 # Get video webpage
2786 self.report_video_webpage_download(video_id)
2787 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2788 try:
2789 page = urllib2.urlopen(request)
2790 video_webpage = page.read()
2791 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2792 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2793 return
2794
2795 # Start extracting information
2796 self.report_information_extraction(video_id)
2797
2798 # Extract information
2799 video_info = self._parse_page(video_webpage)
2800
2801 # uploader
2802 if 'owner' not in video_info:
2803 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2804 return
2805 video_uploader = video_info['owner']
2806
2807 # title
2808 if 'title' not in video_info:
2809 self._downloader.trouble(u'ERROR: unable to extract video title')
2810 return
2811 video_title = video_info['title']
2812 video_title = video_title.decode('utf-8')
2813 video_title = sanitize_title(video_title)
2814
2815 # simplified title
2816 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2817 simple_title = simple_title.strip(ur'_')
2818
2819 # thumbnail image
2820 if 'thumbnail' not in video_info:
2821 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2822 video_thumbnail = ''
2823 else:
2824 video_thumbnail = video_info['thumbnail']
2825
2826 # upload date
2827 upload_date = u'NA'
2828 if 'upload_date' in video_info:
2829 upload_time = video_info['upload_date']
2830 timetuple = email.utils.parsedate_tz(upload_time)
2831 if timetuple is not None:
2832 try:
2833 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2834 except:
2835 pass
2836
2837 # description
8b95c387 2838 video_description = video_info.get('description', 'No description available.')
9f5f9602
GI
2839
2840 url_map = video_info['video_urls']
2841 if len(url_map.keys()) > 0:
2842 # Decide which formats to download
2843 req_format = self._downloader.params.get('format', None)
2844 format_limit = self._downloader.params.get('format_limit', None)
2845
2846 if format_limit is not None and format_limit in self._available_formats:
2847 format_list = self._available_formats[self._available_formats.index(format_limit):]
2848 else:
2849 format_list = self._available_formats
2850 existing_formats = [x for x in format_list if x in url_map]
2851 if len(existing_formats) == 0:
2852 self._downloader.trouble(u'ERROR: no known formats available for video')
2853 return
2854 if req_format is None:
2855 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2856 elif req_format == '-1':
2857 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2858 else:
2859 # Specific format
2860 if req_format not in url_map:
2861 self._downloader.trouble(u'ERROR: requested format not available')
2862 return
2863 video_url_list = [(req_format, url_map[req_format])] # Specific format
2864
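			# Illustration (assuming url_map contains both entries): with no -f option
			# the list is [('highqual', url)], "-f -1" selects both formats, and
			# "-f lowqual" selects only [('lowqual', url)].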
2865 for format_param, video_real_url in video_url_list:
2866
2867 # At this point we have a new video
2868 self._downloader.increment_downloads()
2869
2870 # Extension
2871 video_extension = self._video_extensions.get(format_param, 'mp4')
2872
9f5f9602
GI
2873 try:
2874 # Process video information
2875 self._downloader.process_info({
2876 'id': video_id.decode('utf-8'),
2877 'url': video_real_url.decode('utf-8'),
2878 'uploader': video_uploader.decode('utf-8'),
2879 'upload_date': upload_date,
2880 'title': video_title,
2881 'stitle': simple_title,
2882 'ext': video_extension.decode('utf-8'),
2883 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2884 'thumbnail': video_thumbnail.decode('utf-8'),
2885 'description': video_description.decode('utf-8'),
2886 'player_url': None,
2887 })
2888 except UnavailableVideoError, err:
2889 self._downloader.trouble(u'\nERROR: unable to download video')
2890
7745f5d8
PH
2891class BlipTVIE(InfoExtractor):
2892 """Information extractor for blip.tv"""
2893
1cab2c6d 2894 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
7745f5d8
PH
2895 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2896
2897 @staticmethod
2898 def suitable(url):
2899 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2900
7745f5d8
PH
2901 def report_extraction(self, file_id):
2902 """Report information extraction."""
aded78d9 2903 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
7745f5d8
PH
2904
2905 def _simplify_title(self, title):
2906 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2907 res = res.strip(ur'_')
2908 return res
2909
2910 def _real_extract(self, url):
2911 mobj = re.match(self._VALID_URL, url)
2912 if mobj is None:
2913 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2914 return
2915
1293ce58
PH
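# blip.tv returns the post metadata as JSON when 'skin=json' is appended,
# so build that query URL here, choosing '&' or '?' depending on whether
# the original URL already carries a query string.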
2916 if '?' in url:
2917 cchar = '&'
2918 else:
2919 cchar = '?'
2920 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
7745f5d8 2921 request = urllib2.Request(json_url)
aded78d9 2922 self.report_extraction(mobj.group(1))
7745f5d8
PH
2923 try:
2924 json_code = urllib2.urlopen(request).read()
2925 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2926 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2927 return
2928 try:
2929 json_data = json.loads(json_code)
1293ce58
PH
2930 if 'Post' in json_data:
2931 data = json_data['Post']
2932 else:
2933 data = json_data
7745f5d8
PH
2934
2935 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2936 video_url = data['media']['url']
2937 umobj = re.match(self._URL_EXT, video_url)
2938 if umobj is None:
2939 raise ValueError('Can not determine filename extension')
2940 ext = umobj.group(1)
2941
a1cab7ce
PH
2942 self._downloader.increment_downloads()
2943
7745f5d8
PH
2944 info = {
2945 'id': data['item_id'],
2946 'url': video_url,
2947 'uploader': data['display_name'],
2948 'upload_date': upload_date,
2949 'title': data['title'],
2950 'stitle': self._simplify_title(data['title']),
2951 'ext': ext,
2952 'format': data['media']['mimeType'],
2953 'thumbnail': data['thumbnailUrl'],
2954 'description': data['description'],
2955 'player_url': data['embedUrl']
2956 }
2957 except (ValueError, KeyError), err:
aded78d9 2958 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
7745f5d8
PH
2959 return
2960
2961 try:
2962 self._downloader.process_info(info)
2963 except UnavailableVideoError, err:
2964 self._downloader.trouble(u'\nERROR: unable to download video')
2965
2966
9b0a8bc1
PH
2967class MyVideoIE(InfoExtractor):
2968 """Information Extractor for myvideo.de."""
2969
2970 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2971
2972 def __init__(self, downloader=None):
2973 InfoExtractor.__init__(self, downloader)
2974
2975 @staticmethod
2976 def suitable(url):
2977 return (re.match(MyVideoIE._VALID_URL, url) is not None)
2978
2979 def report_download_webpage(self, video_id):
2980 """Report webpage download."""
2981 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2982
2983 def report_extraction(self, video_id):
2984 """Report information extraction."""
2985 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2986
2987 def _real_initialize(self):
2988 return
2989
2990 def _real_extract(self,url):
2991 mobj = re.match(self._VALID_URL, url)
2992 if mobj is None:
2993 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2994 return
2995
2996 video_id = mobj.group(1)
2997 simple_title = mobj.group(2).decode('utf-8')
2998 # should actually not be necessary
2999 simple_title = sanitize_title(simple_title)
3000 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
3001
3002 # Get video webpage
3003 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3004 try:
3005 self.report_download_webpage(video_id)
3006 webpage = urllib2.urlopen(request).read()
3007 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3008 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3009 return
3010
3011 self.report_extraction(video_id)
3012 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3013 webpage)
3014 if mobj is None:
3015 self._downloader.trouble(u'ERROR: unable to extract media URL')
3016 return
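# Derive the downloadable .flv URL from the movie base path captured from
# the thumbnail (image_src) link above.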
3017 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3018
3019 mobj = re.search('<title>([^<]+)</title>', webpage)
3020 if mobj is None:
3021 self._downloader.trouble(u'ERROR: unable to extract title')
3022 return
3023
3024 video_title = mobj.group(1)
3025 video_title = sanitize_title(video_title)
3026
3027 try:
3029 self._downloader.process_info({
3030 'id': video_id,
3031 'url': video_url,
3032 'uploader': u'NA',
3033 'upload_date': u'NA',
3034 'title': video_title,
3035 'stitle': simple_title,
3036 'ext': u'flv',
3037 'format': u'NA',
3038 'player_url': None,
3039 })
3040 except UnavailableVideoError:
3041 self._downloader.trouble(u'\nERROR: Unable to download video')
3042
c8e30044 3043class ComedyCentralIE(InfoExtractor):
f166bccc 3044 """Information extractor for The Daily Show and Colbert Report """
c8e30044 3045
f166bccc 3046 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
c8e30044
PH
3047
3048 @staticmethod
3049 def suitable(url):
3050 return (re.match(ComedyCentralIE._VALID_URL, url) is not None)
3051
3052 def report_extraction(self, episode_id):
3053 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3054
3055 def report_config_download(self, episode_id):
3056 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3057
fedf9f39
PH
3058 def report_player_url(self, episode_id):
3059 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3060
c8e30044
PH
3061 def _simplify_title(self, title):
3062 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3063 res = res.strip(ur'_')
3064 return res
3065
3066 def _real_extract(self, url):
3067 mobj = re.match(self._VALID_URL, url)
3068 if mobj is None:
3069 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3070 return
f166bccc
PH
3071
3072 if mobj.group('shortname'):
3073 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3074 url = 'http://www.thedailyshow.com/full-episodes/'
3075 else:
3076 url = 'http://www.colbertnation.com/full-episodes/'
3077 mobj = re.match(self._VALID_URL, url)
3078 assert mobj is not None
3079
3080 dlNewest = not mobj.group('episode')
3081 if dlNewest:
3082 epTitle = mobj.group('showname')
3083 else:
3084 epTitle = mobj.group('episode')
c8e30044
PH
3085
3086 req = urllib2.Request(url)
3087 self.report_extraction(epTitle)
3088 try:
f166bccc
PH
3089 htmlHandle = urllib2.urlopen(req)
3090 html = htmlHandle.read()
c8e30044
PH
3091 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3092 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3093 return
f166bccc
PH
3094 if dlNewest:
3095 url = htmlHandle.geturl()
3096 mobj = re.match(self._VALID_URL, url)
3097 if mobj is None:
3098 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3099 return
3100 if mobj.group('episode') == '':
3101 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3102 return
3103 epTitle = mobj.group('episode')
c8e30044 3104
a88bc6bb 3105 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/(.*?:episode:([^:]*):)(.*?))"/>', html)
c8e30044
PH
3106 if len(mMovieParams) == 0:
3107 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3108 return
a88bc6bb
PH
3109 show_id = mMovieParams[0][2]
3110 ACT_COUNT = { # TODO: Detect this dynamically
3111 'thedailyshow.com': 4,
3112 'colbertnation.com': 3,
3113 }.get(show_id, 4)
3114 OFFSET = {
d793aeba 3115 'thedailyshow.com': 1,
a88bc6bb 3116 'colbertnation.com': 1,
d793aeba 3117 }.get(show_id, 1)
a88bc6bb 3118
fedf9f39 3119 first_player_url = mMovieParams[0][0]
5991ddfd 3120 startMediaNum = int(mMovieParams[0][3]) + OFFSET
0f862ea1 3121 movieId = mMovieParams[0][1]
c8e30044 3122
fedf9f39
PH
3123 playerReq = urllib2.Request(first_player_url)
3124 self.report_player_url(epTitle)
3125 try:
3126 playerResponse = urllib2.urlopen(playerReq)
3127 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3128 self._downloader.trouble(u'ERROR: unable to download player: %s' % unicode(err))
3129 return
3130 player_url = playerResponse.geturl()
3131
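# A full episode is split into several acts whose media ids follow the
# episode's id consecutively; fetch and parse the config XML for each act.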
c8e30044 3132 for actNum in range(ACT_COUNT):
5991ddfd
PH
3133 mediaNum = startMediaNum + actNum
3134 mediaId = movieId + str(mediaNum)
c8e30044
PH
3135 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3136 urllib.urlencode({'uri': mediaId}))
3137 configReq = urllib2.Request(configUrl)
3138 self.report_config_download(epTitle)
3139 try:
3140 configXml = urllib2.urlopen(configReq).read()
3141 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3142 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3143 return
46c8c432 3144
c8e30044
PH
3145 cdoc = xml.etree.ElementTree.fromstring(configXml)
3146 turls = []
3147 for rendition in cdoc.findall('.//rendition'):
3148 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3149 turls.append(finfo)
3150
a88bc6bb 3151 if len(turls) == 0:
5991ddfd 3152 self._downloader.trouble(u'\nERROR: unable to download ' + str(mediaNum) + ': No videos found')
a88bc6bb
PH
3153 continue
3154
c8e30044
PH
3155 # For now, just pick the highest bitrate
3156 format, video_url = turls[-1]
3157
3158 self._downloader.increment_downloads()
a88bc6bb
PH
3159
3160 effTitle = show_id.replace('.com', '') + '-' + epTitle
c8e30044 3161 info = {
5991ddfd 3162 'id': str(mediaNum),
c8e30044 3163 'url': video_url,
a88bc6bb 3164 'uploader': show_id,
c8e30044 3165 'upload_date': 'NA',
a88bc6bb
PH
3166 'title': effTitle,
3167 'stitle': self._simplify_title(effTitle),
c8e30044
PH
3168 'ext': 'mp4',
3169 'format': format,
3170 'thumbnail': None,
3171 'description': 'TODO: Not yet supported',
0f862ea1 3172 'player_url': player_url
c8e30044 3173 }
46c8c432 3174
c8e30044
PH
3175 try:
3176 self._downloader.process_info(info)
3177 except UnavailableVideoError, err:
5991ddfd 3178 self._downloader.trouble(u'\nERROR: unable to download ' + str(mediaNum))
a88bc6bb 3179 continue
c8e30044
PH
3180
3181
65cd34c5
RG
3182class PostProcessor(object):
3183 """Post Processor class.
3184
3185 PostProcessor objects can be added to downloaders with their
3186 add_post_processor() method. When the downloader has finished a
3187 successful download, it will take its internal chain of PostProcessors
3188 and start calling the run() method on each one of them, first with
3189 an initial argument and then with the returned value of the previous
3190 PostProcessor.
3191
3192 The chain will be stopped if one of them ever returns None or the end
3193 of the chain is reached.
3194
3195 PostProcessor objects follow a "mutual registration" process similar
3196 to InfoExtractor objects.
3197 """
3198
3199 _downloader = None
3200
3201 def __init__(self, downloader=None):
3202 self._downloader = downloader
3203
65cd34c5
RG
3204 def set_downloader(self, downloader):
3205 """Sets the downloader for this PP."""
3206 self._downloader = downloader
d3975459 3207
65cd34c5
RG
3208 def run(self, information):
3209 """Run the PostProcessor.
3210
3211 The "information" argument is a dictionary like the ones
2f11508a 3212 composed by InfoExtractors. The only difference is that this
65cd34c5
RG
3213 one has an extra field called "filepath" that points to the
3214 downloaded file.
3215
3216 When this method returns None, the postprocessing chain is
3217 stopped. However, this method may return an information
3218 dictionary that will be passed to the next postprocessing
3219 object in the chain. It can be the one it received after
3220 changing some fields.
3221
3222 In addition, this method may raise a PostProcessingError
3223 exception that will be taken into account by the downloader
3224 it was called from.
3225 """
3226 return information # by default, do nothing
d3975459 3227
c0a10ca8 3228
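# Illustrative sketch (not part of the original script): a minimal
# PostProcessor subclass showing the run() contract described above and how
# such an object would be attached to a FileDownloader. The class name is
# hypothetical and nothing registers it by default.
class EchoFilepathPP(PostProcessor):
	"""Example post-processor: report the downloaded file and pass the info on."""

	def run(self, information):
		# "information" is the dictionary composed by the InfoExtractor,
		# extended by the downloader with the extra "filepath" field.
		self._downloader.to_screen(u'[echo] downloaded file: %s' % information['filepath'])
		return information # returning the dict keeps the chain going

# Typical registration, mirroring FFmpegExtractAudioPP in main():
#	fd.add_post_processor(EchoFilepathPP())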
3072fab1
RG
3229class FFmpegExtractAudioPP(PostProcessor):
3230
3231 def __init__(self, downloader=None, preferredcodec=None):
3232 PostProcessor.__init__(self, downloader)
3233 if preferredcodec is None:
3234 preferredcodec = 'best'
3235 self._preferredcodec = preferredcodec
3236
3237 @staticmethod
3238 def get_audio_codec(path):
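# Probe the file with ffprobe and return the codec_name of its audio
# stream, or None if ffprobe fails or no audio stream is found.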
da273188 3239 try:
2727dbf7
RG
3240 cmd = ['ffprobe', '-show_streams', '--', path]
3241 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
da273188
RG
3242 output = handle.communicate()[0]
3243 if handle.wait() != 0:
3244 return None
3245 except (IOError, OSError):
3072fab1
RG
3246 return None
3247 audio_codec = None
3248 for line in output.split('\n'):
3249 if line.startswith('codec_name='):
3250 audio_codec = line.split('=')[1].strip()
3251 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
3252 return audio_codec
3253 return None
3254
3255 @staticmethod
3256 def run_ffmpeg(path, out_path, codec, more_opts):
3257 try:
2727dbf7
RG
3258 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3259 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3072fab1
RG
3260 return (ret == 0)
3261 except (IOError, OSError):
3262 return False
3263
3264 def run(self, information):
3265 path = information['filepath']
3266
3267 filecodec = self.get_audio_codec(path)
3268 if filecodec is None:
da273188 3269 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
3072fab1
RG
3270 return None
3271
3272 more_opts = []
3273 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3274 if filecodec == 'aac' or filecodec == 'mp3':
3275 # Lossless if possible
3276 acodec = 'copy'
3277 extension = filecodec
3278 if filecodec == 'aac':
3279 more_opts = ['-f', 'adts']
3280 else:
3281 # MP3 otherwise.
3282 acodec = 'libmp3lame'
3283 extension = 'mp3'
3284 more_opts = ['-ab', '128k']
3285 else:
3286 # We convert the audio (lossy)
3287 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
3288 extension = self._preferredcodec
3289 more_opts = ['-ab', '128k']
3290 if self._preferredcodec == 'aac':
3291 more_opts += ['-f', 'adts']
3292
3293 (prefix, ext) = os.path.splitext(path)
3294 new_path = prefix + '.' + extension
3295 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3296 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3297
3298 if not status:
1bd92582 3299 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
3072fab1
RG
3300 return None
3301
3302 try:
3303 os.remove(path)
3304 except (IOError, OSError):
3305 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
3306 return None
3307
3308 information['filepath'] = new_path
3309 return information
3310
5fb3df4a
GV
3311
3312def updateSelf(downloader, filename):
3313 ''' Update the program file with the latest version from the repository '''
3314 # Note: downloader only used for options
3315 if not os.access(filename, os.W_OK):
3316 sys.exit('ERROR: no write permissions on %s' % filename)
3317
d207e7cf 3318 downloader.to_screen('Updating to latest version...')
5fb3df4a 3319
4fa74b52 3320 try:
d207e7cf
PH
3321 try:
3322 urlh = urllib.urlopen(UPDATE_URL)
3323 newcontent = urlh.read()
3324 finally:
3325 urlh.close()
5fb3df4a
GV
3326 except (IOError, OSError), err:
3327 sys.exit('ERROR: unable to download latest version')
f9f1e798 3328
5fb3df4a 3329 try:
d207e7cf
PH
3330 outf = open(filename, 'wb')
3331 try:
3332 outf.write(newcontent)
3333 finally:
3334 outf.close()
5fb3df4a
GV
3335 except (IOError, OSError), err:
3336 sys.exit('ERROR: unable to overwrite current version')
4bec29ef 3337
d207e7cf 3338 downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
80066952 3339
4f9f96f6
GV
3340def parseOpts():
3341 # Deferred imports
3342 import getpass
3343 import optparse
e7cf18cb 3344
4f9f96f6
GV
3345 def _format_option_string(option):
3346 ''' ('-o', '--option') -> -o, --option METAVAR'''
80066952 3347
4f9f96f6
GV
3348 opts = []
3349
3350 if option._short_opts: opts.append(option._short_opts[0])
3351 if option._long_opts: opts.append(option._long_opts[0])
3352 if len(opts) > 1: opts.insert(1, ', ')
3353
3354 if option.takes_value(): opts.append(' %s' % option.metavar)
3355
3356 return "".join(opts)
3357
6a4f0a11
GV
3358 def _find_term_columns():
3359 columns = os.environ.get('COLUMNS', None)
2c8d32de
PH
3360 if columns:
3361 return int(columns)
3362
4f2a5e06
PH
3363 try:
3364 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3365 out,err = sp.communicate()
eb0387a8 3366 return int(out.split()[1])
4f2a5e06
PH
3367 except:
3368 pass
2c8d32de 3369 return None
6a4f0a11 3370
51c8e53f
GV
3371 max_width = 80
3372 max_help_position = 80
3373
3374 # No need to wrap help messages if we're on a wide console
6a4f0a11 3375 columns = _find_term_columns()
51c8e53f
GV
3376 if columns: max_width = columns
3377
3378 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4f9f96f6
GV
3379 fmt.format_option_strings = _format_option_string
3380
3381 kw = {
3382 'version' : __version__,
3383 'formatter' : fmt,
2c8d32de 3384 'usage' : '%prog [options] url...',
4f9f96f6
GV
3385 'conflict_handler' : 'resolve',
3386 }
3387
3388 parser = optparse.OptionParser(**kw)
3389
3390 # option groups
3391 general = optparse.OptionGroup(parser, 'General Options')
3392 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3393 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3394 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3395 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3396 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3397
3398 general.add_option('-h', '--help',
3399 action='help', help='print this help text and exit')
3400 general.add_option('-v', '--version',
3401 action='version', help='print program version and exit')
3402 general.add_option('-U', '--update',
e0e56865 3403 action='store_true', dest='update_self', help='update this program to latest version')
4f9f96f6
GV
3404 general.add_option('-i', '--ignore-errors',
3405 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3406 general.add_option('-r', '--rate-limit',
3407 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3408 general.add_option('-R', '--retries',
3409 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3410 general.add_option('--playlist-start',
3411 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3412 general.add_option('--playlist-end',
3413 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3414 general.add_option('--dump-user-agent',
3415 action='store_true', dest='dump_user_agent',
3416 help='display the current browser identification', default=False)
3417
3418 authentication.add_option('-u', '--username',
3419 dest='username', metavar='USERNAME', help='account username')
3420 authentication.add_option('-p', '--password',
3421 dest='password', metavar='PASSWORD', help='account password')
3422 authentication.add_option('-n', '--netrc',
3423 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3424
3425
3426 video_format.add_option('-f', '--format',
3427 action='store', dest='format', metavar='FORMAT', help='video format code')
3428 video_format.add_option('--all-formats',
3429 action='store_const', dest='format', help='download all available video formats', const='-1')
3430 video_format.add_option('--max-quality',
3431 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3432
3433
3434 verbosity.add_option('-q', '--quiet',
3435 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3436 verbosity.add_option('-s', '--simulate',
3437 action='store_true', dest='simulate', help='do not download video', default=False)
3438 verbosity.add_option('-g', '--get-url',
3439 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3440 verbosity.add_option('-e', '--get-title',
3441 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3442 verbosity.add_option('--get-thumbnail',
3443 action='store_true', dest='getthumbnail',
3444 help='simulate, quiet but print thumbnail URL', default=False)
3445 verbosity.add_option('--get-description',
3446 action='store_true', dest='getdescription',
3447 help='simulate, quiet but print video description', default=False)
3448 verbosity.add_option('--get-filename',
3449 action='store_true', dest='getfilename',
3450 help='simulate, quiet but print output filename', default=False)
3451 verbosity.add_option('--no-progress',
3452 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3453 verbosity.add_option('--console-title',
3454 action='store_true', dest='consoletitle',
3455 help='display progress in console titlebar', default=False)
3456
3457
3458 filesystem.add_option('-t', '--title',
3459 action='store_true', dest='usetitle', help='use title in file name', default=False)
3460 filesystem.add_option('-l', '--literal',
3461 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3462 filesystem.add_option('-A', '--auto-number',
3463 action='store_true', dest='autonumber',
3464 help='number downloaded files starting from 00000', default=False)
3465 filesystem.add_option('-o', '--output',
3466 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3467 filesystem.add_option('-a', '--batch-file',
3468 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3469 filesystem.add_option('-w', '--no-overwrites',
3470 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3471 filesystem.add_option('-c', '--continue',
3472 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3473 filesystem.add_option('--cookies',
3474 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3475 filesystem.add_option('--no-part',
3476 action='store_true', dest='nopart', help='do not use .part files', default=False)
3477 filesystem.add_option('--no-mtime',
3478 action='store_false', dest='updatetime',
3479 help='do not use the Last-modified header to set the file modification time', default=True)
2c8d32de
PH
3480 filesystem.add_option('--write-description',
3481 action='store_true', dest='writedescription',
3482 help='write video description to a .description file', default=False)
3483 filesystem.add_option('--write-info-json',
3484 action='store_true', dest='writeinfojson',
3485 help='write video metadata to a .info.json file', default=False)
4f9f96f6
GV
3486
3487
3488 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3489 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3490 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3491 help='"best", "aac" or "mp3"; best by default')
3492
3493
3494 parser.add_option_group(general)
3495 parser.add_option_group(filesystem)
3496 parser.add_option_group(verbosity)
3497 parser.add_option_group(video_format)
3498 parser.add_option_group(authentication)
3499 parser.add_option_group(postproc)
3500
3501 opts, args = parser.parse_args()
3502
3503 return parser, opts, args
3504
5adcaa43
GV
3505def main():
3506 parser, opts, args = parseOpts()
4f9f96f6 3507
5adcaa43
GV
3508 # Open appropriate CookieJar
3509 if opts.cookiefile is None:
3510 jar = cookielib.CookieJar()
3511 else:
8cc44341 3512 try:
5adcaa43
GV
3513 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3514 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3515 jar.load()
3516 except (IOError, OSError), err:
3517 sys.exit(u'ERROR: unable to open cookie file')
80066952 3518
5adcaa43
GV
3519 # Dump user agent
3520 if opts.dump_user_agent:
3521 print std_headers['User-Agent']
3522 sys.exit(0)
e7cf18cb 3523
5adcaa43
GV
3524 # General configuration
3525 cookie_processor = urllib2.HTTPCookieProcessor(jar)
c8e30044
PH
3526 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3527 urllib2.install_opener(opener)
5adcaa43 3528 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
80066952 3529
5adcaa43
GV
3530 # Batch file verification
3531 batchurls = []
3532 if opts.batchfile is not None:
8cc44341 3533 try:
5adcaa43
GV
3534 if opts.batchfile == '-':
3535 batchfd = sys.stdin
4bec29ef 3536 else:
5adcaa43
GV
3537 batchfd = open(opts.batchfile, 'r')
3538 batchurls = batchfd.readlines()
3539 batchurls = [x.strip() for x in batchurls]
3540 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3541 except IOError:
3542 sys.exit(u'ERROR: batch file could not be read')
3543 all_urls = batchurls + args
3544
3545 # Conflicting, missing and erroneous options
3546 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3547 parser.error(u'using .netrc conflicts with giving username/password')
3548 if opts.password is not None and opts.username is None:
3549 parser.error(u'account username missing')
3550 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3551 parser.error(u'using output template conflicts with using title, literal title or auto number')
3552 if opts.usetitle and opts.useliteral:
3553 parser.error(u'using title conflicts with using literal title')
3554 if opts.username is not None and opts.password is None:
3555 opts.password = getpass.getpass(u'Type account password and press return:')
3556 if opts.ratelimit is not None:
3557 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3558 if numeric_limit is None:
3559 parser.error(u'invalid rate limit specified')
3560 opts.ratelimit = numeric_limit
3561 if opts.retries is not None:
8cc44341 3562 try:
5adcaa43 3563 opts.retries = long(opts.retries)
8cc44341 3564 except (TypeError, ValueError), err:
5adcaa43
GV
3565 parser.error(u'invalid retry count specified')
3566 try:
2c8d32de 3567 opts.playliststart = int(opts.playliststart)
5adcaa43 3568 if opts.playliststart <= 0:
2c8d32de 3569 raise ValueError(u'Playlist start must be positive')
5adcaa43
GV
3570 except (TypeError, ValueError), err:
3571 parser.error(u'invalid playlist start number specified')
3572 try:
2c8d32de 3573 opts.playlistend = int(opts.playlistend)
5adcaa43 3574 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2c8d32de 3575 raise ValueError(u'Playlist end must be greater than playlist start')
5adcaa43
GV
3576 except (TypeError, ValueError), err:
3577 parser.error(u'invalid playlist end number specified')
3578 if opts.extractaudio:
3579 if opts.audioformat not in ['best', 'aac', 'mp3']:
3580 parser.error(u'invalid audio format specified')
3581
3582 # Information extractors
3583 youtube_ie = YoutubeIE()
3584 metacafe_ie = MetacafeIE(youtube_ie)
3585 dailymotion_ie = DailymotionIE()
3586 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
3587 youtube_user_ie = YoutubeUserIE(youtube_ie)
3588 youtube_search_ie = YoutubeSearchIE(youtube_ie)
3589 google_ie = GoogleIE()
3590 google_search_ie = GoogleSearchIE(google_ie)
3591 photobucket_ie = PhotobucketIE()
3592 yahoo_ie = YahooIE()
3593 yahoo_search_ie = YahooSearchIE(yahoo_ie)
3594 deposit_files_ie = DepositFilesIE()
3595 facebook_ie = FacebookIE()
2c8d32de 3596 bliptv_ie = BlipTVIE()
9c3e23fb 3597 vimeo_ie = VimeoIE()
9b0a8bc1 3598 myvideo_ie = MyVideoIE()
c8e30044 3599 comedycentral_ie = ComedyCentralIE()
9b0a8bc1 3600
5adcaa43
GV
3601 generic_ie = GenericIE()
3602
3603 # File downloader
3604 fd = FileDownloader({
3605 'usenetrc': opts.usenetrc,
3606 'username': opts.username,
3607 'password': opts.password,
3608 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3609 'forceurl': opts.geturl,
3610 'forcetitle': opts.gettitle,
3611 'forcethumbnail': opts.getthumbnail,
3612 'forcedescription': opts.getdescription,
3613 'forcefilename': opts.getfilename,
3614 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3615 'format': opts.format,
3616 'format_limit': opts.format_limit,
3617 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3618 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3619 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3620 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3621 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3622 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3623 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3624 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3625 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3626 or u'%(id)s.%(ext)s'),
3627 'ignoreerrors': opts.ignoreerrors,
3628 'ratelimit': opts.ratelimit,
3629 'nooverwrites': opts.nooverwrites,
3630 'retries': opts.retries,
3631 'continuedl': opts.continue_dl,
3632 'noprogress': opts.noprogress,
3633 'playliststart': opts.playliststart,
3634 'playlistend': opts.playlistend,
3635 'logtostderr': opts.outtmpl == '-',
3636 'consoletitle': opts.consoletitle,
3637 'nopart': opts.nopart,
3638 'updatetime': opts.updatetime,
2c8d32de
PH
3639 'writedescription': opts.writedescription,
3640 'writeinfojson': opts.writeinfojson,
5adcaa43
GV
3641 })
3642 fd.add_info_extractor(youtube_search_ie)
3643 fd.add_info_extractor(youtube_pl_ie)
3644 fd.add_info_extractor(youtube_user_ie)
3645 fd.add_info_extractor(metacafe_ie)
3646 fd.add_info_extractor(dailymotion_ie)
3647 fd.add_info_extractor(youtube_ie)
3648 fd.add_info_extractor(google_ie)
3649 fd.add_info_extractor(google_search_ie)
3650 fd.add_info_extractor(photobucket_ie)
3651 fd.add_info_extractor(yahoo_ie)
3652 fd.add_info_extractor(yahoo_search_ie)
3653 fd.add_info_extractor(deposit_files_ie)
3654 fd.add_info_extractor(facebook_ie)
2c8d32de 3655 fd.add_info_extractor(bliptv_ie)
9c3e23fb 3656 fd.add_info_extractor(vimeo_ie)
9b0a8bc1 3657 fd.add_info_extractor(myvideo_ie)
c8e30044 3658 fd.add_info_extractor(comedycentral_ie)
5adcaa43
GV
3659
3660 # This must come last since it's the
3661 # fallback if none of the others work
3662 fd.add_info_extractor(generic_ie)
3663
3664 # PostProcessors
3665 if opts.extractaudio:
3666 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3667
3668 # Update version
3669 if opts.update_self:
3670 updateSelf(fd, sys.argv[0])
3671
3672 # Maybe do nothing
3673 if len(all_urls) < 1:
3674 if not opts.update_self:
3675 parser.error(u'you must provide at least one URL')
3676 else:
3677 sys.exit()
3678 retcode = fd.download(all_urls)
80066952 3679
5adcaa43
GV
3680 # Dump cookie jar if requested
3681 if opts.cookiefile is not None:
3682 try:
3683 jar.save()
3684 except (IOError, OSError), err:
3685 sys.exit(u'ERROR: unable to save cookie jar')
80066952 3686
5adcaa43 3687 sys.exit(retcode)
80066952 3688
4fa74b52 3689
5adcaa43
GV
3690if __name__ == '__main__':
3691 try:
3692 main()
e5bf0f55
RG
3693 except DownloadError:
3694 sys.exit(1)
3695 except SameFileError:
76a7f364 3696 sys.exit(u'ERROR: fixed output name but more than one file to download')
4fa74b52 3697 except KeyboardInterrupt:
76a7f364 3698 sys.exit(u'\nERROR: Interrupted by user')
e9cb9c28
GV
3699
3700# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: