#!/usr/bin/env python
# -*- coding: utf-8 -*-

__author__ = (
    'Ricardo Garcia Gonzalez',
    'Danny Colligan',
    'Benjamin Johnson',
    'Vasyl\' Vavrychuk',
    'Witold Baryluk',
    'Paweł Paprota',
    'Gergely Imreh',
    'Rogério Brito',
    'Philipp Hagemeister',
    'Sören Schulze',
    )

__license__ = 'Public Domain'
__version__ = '2011.09.15'

UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'

import cookielib
import datetime
import gzip
import htmlentitydefs
import HTMLParser
import httplib
import locale
import math
import netrc
import os
import os.path
import re
import socket
import string
import subprocess
import sys
import time
import urllib
import urllib2
import warnings
import zlib

if os.name == 'nt':
    import ctypes

try:
    import email.utils
except ImportError: # Python 2.4
    import email.Utils
try:
    import cStringIO as StringIO
except ImportError:
    import StringIO

# parse_qs was moved from the cgi module to the urlparse module recently.
try:
    from urlparse import parse_qs
except ImportError:
    from cgi import parse_qs

try:
    import lxml.etree
except ImportError:
    pass # Handled below

try:
    import xml.etree.ElementTree
except ImportError: # Python<2.5
    pass # Not officially supported, but let it slip

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}

simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')

try:
    import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    import re
    class json(object):
        @staticmethod
        def loads(s):
            s = s.decode('UTF-8')
            def raiseError(msg, i):
                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
            def skipSpace(i, expectMore=True):
                while i < len(s) and s[i] in ' \t\r\n':
                    i += 1
                if expectMore:
                    if i >= len(s):
                        raiseError('Premature end', i)
                return i
            def decodeEscape(match):
                esc = match.group(1)
                _STATIC = {
                    '"': '"',
                    '\\': '\\',
                    '/': '/',
                    'b': unichr(0x8),
                    'f': unichr(0xc),
                    'n': '\n',
                    'r': '\r',
                    't': '\t',
                }
                if esc in _STATIC:
                    return _STATIC[esc]
                if esc[0] == 'u':
                    if len(esc) == 1+4:
                        return unichr(int(esc[1:5], 16))
                    if len(esc) == 5+6 and esc[5:7] == '\\u':
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                raise ValueError('Unknown escape ' + str(esc))
            def parseString(i):
                i += 1
                e = i
                while True:
                    e = s.index('"', e)
                    bslashes = 0
                    while s[e-bslashes-1] == '\\':
                        bslashes += 1
                    if bslashes % 2 == 1:
                        e += 1
                        continue
                    break
                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                stri = rexp.sub(decodeEscape, s[i:e])
                return (e+1, stri)
            def parseObj(i):
                i += 1
                res = {}
                i = skipSpace(i)
                if s[i] == '}': # Empty dictionary
                    return (i+1, res)
                while True:
                    if s[i] != '"':
                        raiseError('Expected a string object key', i)
                    i, key = parseString(i)
                    i = skipSpace(i)
                    if i >= len(s) or s[i] != ':':
                        raiseError('Expected a colon', i)
                    i, val = parse(i+1)
                    res[key] = val
                    i = skipSpace(i)
                    if s[i] == '}':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected comma or closing curly brace', i)
                    i = skipSpace(i+1)
            def parseArray(i):
                res = []
                i = skipSpace(i+1)
                if s[i] == ']': # Empty array
                    return (i+1, res)
                while True:
                    i, val = parse(i)
                    res.append(val)
                    i = skipSpace(i) # Raise exception if premature end
                    if s[i] == ']':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected a comma or closing bracket', i)
                    i = skipSpace(i+1)
            def parseDiscrete(i):
                for k, v in {'true': True, 'false': False, 'null': None}.items():
                    if s.startswith(k, i):
                        return (i+len(k), v)
                raiseError('Not a boolean (or null)', i)
            def parseNumber(i):
                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                if mobj is None:
                    raiseError('Not a number', i)
                nums = mobj.group(1)
                if '.' in nums or 'e' in nums or 'E' in nums:
                    return (i+len(nums), float(nums))
                return (i+len(nums), int(nums))
            CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
            def parse(i):
                i = skipSpace(i)
                i, res = CHARMAP.get(s[i], parseNumber)(i)
                i = skipSpace(i, False)
                return (i, res)
            i, res = parse(0)
            if i < len(s):
                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
            return res

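# Illustrative example only: both the standard json module and the trivialjson
# fallback above expose the single entry point this script relies on,
# json.loads(), and both return unicode strings for keys and values, e.g.:
#
#   >>> json.loads('{"title": "test", "itag": [22, 35.0], "ok": true}')
#   {u'title': u'test', u'itag': [22, 35.0], u'ok': True}
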
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    def yield_preferredencoding():
        try:
            pref = locale.getpreferredencoding()
            u'TEST'.encode(pref)
        except:
            pref = 'UTF-8'
        while True:
            yield pref
    return yield_preferredencoding().next()


def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Unicode character
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)


def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename."""
    utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
    return utitle.replace(unicode(os.sep), u'%')


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(filename, open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp

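# Illustrative example only: htmlentity_transform() is normally reached through
# sanitize_title(), which decodes HTML entities and replaces os.sep so the
# title can safely become part of a filename, e.g.:
#
#   >>> sanitize_title(u'Foo &amp; Bar &#47; Baz')   # on a POSIX system (os.sep == '/')
#   u'Foo & Bar % Baz'
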
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected

class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

      http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

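# Illustrative sketch only: a YoutubeDLHandler instance is meant to be chained
# into a urllib2 opener (the actual wiring is done elsewhere in this script),
# along the lines of:
#
#   cookie_processor = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
#   opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
#   urllib2.install_opener(opener)
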
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. Since, given a video URL, the downloader doesn't know how to
    extract all the needed information (a task that InfoExtractors do), it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:          Username for authentication purposes.
    password:          Password for authentication purposes.
    usenetrc:          Use netrc for authentication instead.
    quiet:             Do not print messages to stdout.
    forceurl:          Force printing final URL.
    forcetitle:        Force printing title.
    forcethumbnail:    Force printing thumbnail URL.
    forcedescription:  Force printing description.
    forcefilename:     Force printing final filename.
    simulate:          Do not download the video files.
    format:            Video format code.
    format_limit:      Highest quality format to try.
    outtmpl:           Template for output names.
    ignoreerrors:      Do not stop on download errors.
    ratelimit:         Download speed limit, in bytes/sec.
    nooverwrites:      Prevent overwriting files.
    retries:           Number of times to retry for HTTP error 5xx.
    continuedl:        Try to continue downloads if possible.
    noprogress:        Do not print the progress bar.
    playliststart:     Playlist item to start at.
    playlistend:       Playlist item to end at.
    matchtitle:        Download only matching titles.
    rejecttitle:       Reject downloads for matching titles.
    logtostderr:       Log messages to stderr instead of stdout.
    consoletitle:      Display progress in console window's titlebar.
    nopart:            Do not use temporary .part files.
    updatetime:        Use the Last-modified header to set output file timestamps.
    writedescription:  Write the video description to a .description file.
    writeinfojson:     Write the video metadata to a .info.json file.
    """

    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None
    _screen_file = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._ies = []
        self._pps = []
        self._download_retcode = 0
        self._num_downloads = 0
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self.params = params

    @staticmethod
    def format_bytes(bytes):
        if bytes is None:
            return 'N/A'
        if type(bytes) is str:
            bytes = float(bytes)
        if bytes == 0.0:
            exponent = 0
        else:
            exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffix)

    @staticmethod
    def calc_percent(byte_counter, data_len):
        if data_len is None:
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001: # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        dif = now - start
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return long(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return long(new_max)
        if rate < new_min:
            return long(new_min)
        return long(rate)

    @staticmethod
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        if matchobj is None:
            return None
        number = float(matchobj.group(1))
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

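    # Illustrative example only: parse_bytes() and format_bytes() are rough
    # inverses using 1024-based suffixes, e.g.:
    #
    #   >>> FileDownloader.parse_bytes('50k')
    #   51200L
    #   >>> FileDownloader.format_bytes(51200)
    #   '50.00k'
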
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        pp.set_downloader(self)

    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        try:
            if not self.params.get('quiet', False):
                terminator = [u'\n', u''][skip_eol]
                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                self._screen_file.flush()
        except (UnicodeEncodeError), err:
            if not ignore_encoding_errors:
                raise

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message.encode(preferredencoding())

    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            return
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))

    def fixed_template(self):
        """Checks if the output template is fixed."""
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on whether the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            return
        now = time.time()
        elapsed = now - start_time
        if elapsed <= 0.0:
            return
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def temp_name(self, filename):
        """Returns a temporary filename for the given filename."""
        if self.params.get('nopart', False) or filename == u'-' or \
                (os.path.exists(filename) and not os.path.isfile(filename)):
            return filename
        return filename + u'.part'

    def undo_temp_name(self, filename):
        """Strip the temporary .part suffix from a filename, if present."""
        if filename.endswith(u'.part'):
            return filename[:-len(u'.part')]
        return filename

    def try_rename(self, old_filename, new_filename):
        try:
            if old_filename == new_filename:
                return
            os.rename(old_filename, new_filename)
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')

    def try_utime(self, filename, last_modified_hdr):
        """Try to set the last-modified time of the given file."""
        if last_modified_hdr is None:
            return
        if not os.path.isfile(filename):
            return
        timestr = last_modified_hdr
        if timestr is None:
            return
        filetime = timeconvert(timestr)
        if filetime is None:
            return
        try:
            os.utime(filename, (time.time(), filetime))
        except:
            pass

    def report_writedescription(self, descfn):
        """Report that the description file is being written."""
        self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)

    def report_writeinfojson(self, infofn):
        """Report that the metadata file is being written."""
        self.to_screen(u'[info] Writing video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            return
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            self.to_screen(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        else:
            self.to_screen(u'')

    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1

    def prepare_filename(self, info_dict):
        """Generate the output filename."""
        try:
            template_dict = dict(info_dict)
            template_dict['epoch'] = unicode(long(time.time()))
            template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
            filename = self.params['outtmpl'] % template_dict
            return filename
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            return None

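    # Illustrative example only: prepare_filename() expands the 'outtmpl'
    # option with ordinary %(name)s substitution against the info dictionary
    # (plus the 'epoch' and 'autonumber' keys added above), e.g.:
    #
    #   >>> u'%(stitle)s-%(id)s.%(ext)s' % {'stitle': u'some_video', 'id': u'abc123', 'ext': u'mp4'}
    #   u'some_video-abc123.mp4'
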
    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        filename = self.prepare_filename(info_dict)
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Forced printings
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcefilename', False) and filename is not None:
                print filename.encode(preferredencoding(), 'xmlcharrefreplace')

            return

        if filename is None:
            return

        matchtitle = self.params.get('matchtitle', False)
        rejecttitle = self.params.get('rejecttitle', False)
        title = info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
        if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
            self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
            return
        if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
            self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
            return

        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')
            return

        try:
            dn = os.path.dirname(filename)
            if dn != '' and not os.path.exists(dn):
                os.makedirs(dn)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directory ' + unicode(err))
            return

        if self.params.get('writedescription', False):
            try:
                descfn = filename + '.description'
                self.report_writedescription(descfn)
                descfile = open(descfn, 'wb')
                try:
                    descfile.write(info_dict['description'].encode('utf-8'))
                finally:
                    descfile.close()
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write description file ' + descfn)
                return

        if self.params.get('writeinfojson', False):
            infofn = filename + '.info.json'
            self.report_writeinfojson(infofn)
            try:
                json.dump
            except (NameError, AttributeError):
                self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
                return
            try:
                infof = open(infofn, 'wb')
                try:
                    json.dump(info_dict, infof)
                finally:
                    infof.close()
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
                return

        try:
            success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
            return
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
            return

        if success:
            try:
                self.post_process(filename, info_dict)
            except (PostProcessingError), err:
                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                return

    def download(self, url_list):
        """Download a given list of URLs."""
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):
                    continue

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it
                ie.extract(url)

                # Suitable InfoExtractor had been found; go to next URL
                break

            if not suitable_found:
                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        info = dict(ie_info)
        info['filepath'] = filename
        for pp in self._pps:
            info = pp.run(info)
            if info is None:
                break

    def _download_with_rtmpdump(self, filename, url, player_url):
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrupted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            if prevsize == cursize and retval == 1:
                break
            # Some rtmp streams seem to abort after ~ 99.8%. Don't complain for those
            if prevsize == cursize and retval == 2 and cursize > 1024:
                self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
                retval = 0
                break
        if retval == 0:
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
            self.try_rename(tmpfilename, filename)
            return True
        else:
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            return False

    def _do_download(self, filename, url, player_url):
        # Check file already present
        if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
            self.report_file_already_downloaded(filename)
            return True

        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        tmpfilename = self.temp_name(filename)
        stream = None
        open_mode = 'wb'

        # Do not include the Accept-Encoding header
        headers = {'Youtubedl-no-compression': 'True'}
        basic_request = urllib2.Request(url, None, headers)
        request = urllib2.Request(url, None, headers)

        # Establish possible resume length
        if os.path.isfile(tmpfilename):
            resume_len = os.path.getsize(tmpfilename)
        else:
            resume_len = 0

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range', 'bytes=%d-' % resume_len)
            open_mode = 'ab'

        count = 0
        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
            try:
                data = urllib2.urlopen(request)
                break
            except (urllib2.HTTPError, ), err:
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                    raise
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    try:
                        # Open the connection again without the range header
                        data = urllib2.urlopen(basic_request)
                        content_length = data.info()['Content-Length']
                    except (urllib2.HTTPError, ), err:
                        if err.code < 500 or err.code >= 600:
                            raise
                    else:
                        # Examine the reported length
                        if (content_length is not None and
                                (resume_len - 100 < long(content_length) < resume_len + 100)):
                            # The file had already been fully downloaded.
                            # Explanation to the above condition: in issue #175 it was revealed that
                            # YouTube sometimes adds or removes a few bytes from the end of the file,
                            # changing the file size slightly and causing problems for some users. So
                            # I decided to implement a suggested change and consider the file
                            # completely downloaded if the file size differs less than 100 bytes from
                            # the one in the hard drive.
                            self.report_file_already_downloaded(filename)
                            self.try_rename(tmpfilename, filename)
                            return True
                        else:
                            # The length does not match, we start the download over
                            self.report_unable_to_resume()
                            open_mode = 'wb'
                            break
            # Retry
            count += 1
            if count <= retries:
                self.report_retry(count, retries)

        if count > retries:
            self.trouble(u'ERROR: giving up after %s retries' % retries)
            return False

        data_len = data.info().get('Content-length', None)
        if data_len is not None:
            data_len = long(data_len) + resume_len
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0 + resume_len
        block_size = 1024
        start = time.time()
        while True:
            # Download and write
            before = time.time()
            data_block = data.read(block_size)
            after = time.time()
            if len(data_block) == 0:
                break
            byte_counter += len(data_block)

            # Open file just in time
            if stream is None:
                try:
                    (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
                    assert stream is not None
                    filename = self.undo_temp_name(tmpfilename)
                    self.report_destination(filename)
                except (OSError, IOError), err:
                    self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
                    return False
            try:
                stream.write(data_block)
            except (IOError, OSError), err:
                self.trouble(u'\nERROR: unable to write data: %s' % str(err))
                return False
            block_size = self.best_block_size(after - before, len(data_block))

            # Progress message
            percent_str = self.calc_percent(byte_counter, data_len)
            eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
            speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
            self.report_progress(percent_str, data_len_str, speed_str, eta_str)

            # Apply rate limit
            self.slow_down(start, byte_counter - resume_len)

        if stream is None:
            self.trouble(u'\nERROR: Did not get any data blocks')
            return False
        stream.close()
        self.report_finish()
        if data_len is not None and byte_counter != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        self.try_rename(tmpfilename, filename)

        # Update file modification time
        if self.params.get('updatetime', True):
            self.try_utime(filename, data.info().get('last-modified', None))

        return True

class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information, possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:          Video identifier.
    url:         Final video URL.
    uploader:    Nickname of the video uploader.
    title:       Literal title.
    stitle:      Simplified title.
    ext:         Video filename extension.
    format:      Video format.
    player_url:  SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Receives a URL and returns True if suitable for this IE."""
        return False

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

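# Illustrative sketch only (not part of the real extractor set): a minimal
# InfoExtractor subclass overrides suitable() and _real_extract(), bumps the
# download counter, and hands a dictionary with the fields listed above to
# self._downloader.process_info():
#
#   class ExampleIE(InfoExtractor):
#       _VALID_URL = r'(?:http://)?(?:www\.)?example\.com/video/(\w+)'
#
#       @staticmethod
#       def suitable(url):
#           return (re.match(ExampleIE._VALID_URL, url) is not None)
#
#       def _real_extract(self, url):
#           video_id = re.match(self._VALID_URL, url).group(1)
#           self._downloader.increment_downloads()
#           self._downloader.process_info({
#               'id': video_id, 'url': u'http://example.com/%s.mp4' % video_id,
#               'uploader': u'NA', 'upload_date': u'NA', 'title': video_id,
#               'stitle': video_id, 'ext': u'mp4', 'format': u'NA', 'player_url': None,
#           })
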
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '45': 'webm',
    }

    @staticmethod
    def suitable(url):
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _real_initialize(self):
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description
        try:
            lxml.etree
        except NameError:
            video_description = u'No description available.'
            if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
                mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
                if mobj is not None:
                    video_description = mobj.group(1).decode('utf-8')
        else:
            html_parser = lxml.etree.HTMLParser(encoding='utf-8')
            vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
            video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
            # TODO use another parser

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description,
                    'player_url': player_url,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')

020f7150
RG
1361class MetacafeIE(InfoExtractor):
1362 """Information Extractor for metacafe.com."""
1363
1364 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
2546e767 1365 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
dbccb6cd 1366 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
020f7150
RG
1367 _youtube_ie = None
1368
1369 def __init__(self, youtube_ie, downloader=None):
1370 InfoExtractor.__init__(self, downloader)
1371 self._youtube_ie = youtube_ie
1372
1373 @staticmethod
1374 def suitable(url):
1375 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1376
1377 def report_disclaimer(self):
1378 """Report disclaimer retrieval."""
331ce0a0 1379 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
020f7150
RG
1380
1381 def report_age_confirmation(self):
1382 """Report attempt to confirm age."""
331ce0a0 1383 self._downloader.to_screen(u'[metacafe] Confirming age')
d3975459 1384
020f7150
RG
1385 def report_download_webpage(self, video_id):
1386 """Report webpage download."""
331ce0a0 1387 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
d3975459 1388
020f7150
RG
1389 def report_extraction(self, video_id):
1390 """Report information extraction."""
331ce0a0 1391 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
020f7150
RG
1392
1393 def _real_initialize(self):
1394 # Retrieve disclaimer
1987c232 1395 request = urllib2.Request(self._DISCLAIMER)
020f7150
RG
1396 try:
1397 self.report_disclaimer()
1398 disclaimer = urllib2.urlopen(request).read()
1399 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 1400 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
020f7150
RG
1401 return
1402
1403 # Confirm age
1404 disclaimer_form = {
2546e767 1405 'filters': '0',
020f7150
RG
1406 'submit': "Continue - I'm over 18",
1407 }
1987c232 1408 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
020f7150
RG
1409 try:
1410 self.report_age_confirmation()
1411 disclaimer = urllib2.urlopen(request).read()
1412 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 1413 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
020f7150 1414 return
d3975459 1415
020f7150
RG
1416 def _real_extract(self, url):
1417 # Extract id and simplified title from URL
1418 mobj = re.match(self._VALID_URL, url)
1419 if mobj is None:
147753eb 1420 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
6f21f686 1421 return
020f7150
RG
1422
1423 video_id = mobj.group(1)
1424
1425 # Check if video comes from YouTube
1426 mobj2 = re.match(r'^yt-(.*)$', video_id)
1427 if mobj2 is not None:
6f21f686
RG
1428 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1429 return
020f7150 1430
df372a65 1431 # At this point we have a new video
9bf7fa52 1432 self._downloader.increment_downloads()
df372a65 1433
020f7150 1434 simple_title = mobj.group(2).decode('utf-8')
020f7150
RG
1435
1436 # Retrieve video webpage to extract further information
1437 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1438 try:
1439 self.report_download_webpage(video_id)
1440 webpage = urllib2.urlopen(request).read()
1441 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 1442 self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
6f21f686 1443 return
020f7150
RG
1444
1445 # Extract URL, uploader and title from webpage
1446 self.report_extraction(video_id)
18963a36 1447 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
c6c555cf
RG
1448 if mobj is not None:
1449 mediaURL = urllib.unquote(mobj.group(1))
6b57e8c5 1450 video_extension = mediaURL[-3:]
d3975459 1451
c6c555cf
RG
1452 # Extract gdaKey if available
1453 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1454 if mobj is None:
1455 video_url = mediaURL
1456 else:
1457 gdaKey = mobj.group(1)
1458 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
109626fc 1459 else:
c6c555cf
RG
1460 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1461 if mobj is None:
1462 self._downloader.trouble(u'ERROR: unable to extract media URL')
1463 return
1464 vardict = parse_qs(mobj.group(1))
1465 if 'mediaData' not in vardict:
1466 self._downloader.trouble(u'ERROR: unable to extract media URL')
1467 return
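			# parse_qs returns a dict of lists (e.g. parse_qs('a=1&b=2') == {'a': ['1'], 'b': ['2']}),
			# hence the [0] below; 'mediaData' appears to hold a JSON-ish blob along the lines of
			# '[{"mediaURL":"http:\/\/...","key":"..."}]' (illustrative), so the URL and key are
			# pulled out with a regex rather than a full JSON parse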
1468 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1469 if mobj is None:
1470 self._downloader.trouble(u'ERROR: unable to extract media URL')
1471 return
6b57e8c5
RG
1472 mediaURL = mobj.group(1).replace('\\/', '/')
1473 video_extension = mediaURL[-3:]
1474 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
020f7150 1475
2546e767 1476 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
020f7150 1477 if mobj is None:
147753eb 1478 self._downloader.trouble(u'ERROR: unable to extract title')
6f21f686 1479 return
020f7150 1480 video_title = mobj.group(1).decode('utf-8')
490fd7ae 1481 video_title = sanitize_title(video_title)
020f7150 1482
29f07568 1483 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
020f7150 1484 if mobj is None:
147753eb 1485 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
6f21f686 1486 return
dbccb6cd 1487 video_uploader = mobj.group(1)
020f7150 1488
42bcd27d 1489 try:
1490 # Process video information
1491 self._downloader.process_info({
1492 'id': video_id.decode('utf-8'),
1493 'url': video_url.decode('utf-8'),
1494 'uploader': video_uploader.decode('utf-8'),
138b11f3 1495 'upload_date': u'NA',
42bcd27d 1496 'title': video_title,
1497 'stitle': simple_title,
1498 'ext': video_extension.decode('utf-8'),
6ba562b0 1499 'format': u'NA',
e616ec0c 1500 'player_url': None,
42bcd27d 1501 })
73f4e7af 1502 except UnavailableVideoError:
09cc744c 1503 self._downloader.trouble(u'\nERROR: unable to download video')
020f7150 1504
25af2bce 1505
4135fa45
WB
1506class DailymotionIE(InfoExtractor):
1507 """Information Extractor for Dailymotion"""
1508
1509 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
4135fa45
WB
1510
1511 def __init__(self, downloader=None):
1512 InfoExtractor.__init__(self, downloader)
1513
1514 @staticmethod
1515 def suitable(url):
1516 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1517
4135fa45
WB
1518 def report_download_webpage(self, video_id):
1519 """Report webpage download."""
331ce0a0 1520 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
d3975459 1521
4135fa45
WB
1522 def report_extraction(self, video_id):
1523 """Report information extraction."""
331ce0a0 1524 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
4135fa45
WB
1525
1526 def _real_initialize(self):
1527 return
1528
4135fa45
WB
1529 def _real_extract(self, url):
1530 # Extract id and simplified title from URL
1531 mobj = re.match(self._VALID_URL, url)
1532 if mobj is None:
1533 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1534 return
1535
df372a65 1536 # At this point we have a new video
9bf7fa52 1537 self._downloader.increment_downloads()
4135fa45
WB
1538 video_id = mobj.group(1)
1539
1540 simple_title = mobj.group(2).decode('utf-8')
1541 video_extension = 'flv'
1542
1543 # Retrieve video webpage to extract further information
1544 request = urllib2.Request(url)
62a29bbf 1545 request.add_header('Cookie', 'family_filter=off')
4135fa45
WB
1546 try:
1547 self.report_download_webpage(video_id)
1548 webpage = urllib2.urlopen(request).read()
1549 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 1550 self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
1551 return
1552
1553 # Extract URL, uploader and title from webpage
1554 self.report_extraction(video_id)
62a29bbf 1555 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
4135fa45
WB
1556 if mobj is None:
1557 self._downloader.trouble(u'ERROR: unable to extract media URL')
1558 return
62a29bbf 1559 sequence = urllib.unquote(mobj.group(1))
1560 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1561 if mobj is None:
1562 self._downloader.trouble(u'ERROR: unable to extract media URL')
1563 return
1564 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
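		# 'sequence' appears to be URL-encoded JSON; it contains entries such as
		# "sdURL":"http:\/\/...\/video.mp4" (illustrative), whose backslash escapes are
		# stripped by the replace() above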
4135fa45
WB
1565
 1566 # If the URL is relative, http://www.dailymotion.com/ would need to be prepended here
1567
1568 video_url = mediaURL
1569
62a29bbf 1570 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
4135fa45
WB
1571 if mobj is None:
1572 self._downloader.trouble(u'ERROR: unable to extract title')
1573 return
1574 video_title = mobj.group(1).decode('utf-8')
1575 video_title = sanitize_title(video_title)
1576
62a29bbf 1577 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
4135fa45
WB
1578 if mobj is None:
1579 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1580 return
1581 video_uploader = mobj.group(1)
1582
1583 try:
1584 # Process video information
1585 self._downloader.process_info({
1586 'id': video_id.decode('utf-8'),
1587 'url': video_url.decode('utf-8'),
1588 'uploader': video_uploader.decode('utf-8'),
138b11f3 1589 'upload_date': u'NA',
4135fa45
WB
1590 'title': video_title,
1591 'stitle': simple_title,
1592 'ext': video_extension.decode('utf-8'),
1593 'format': u'NA',
1594 'player_url': None,
1595 })
73f4e7af 1596 except UnavailableVideoError:
09cc744c 1597 self._downloader.trouble(u'\nERROR: unable to download video')
4135fa45 1598
c0a10ca8 1599
49c0028a 1600class GoogleIE(InfoExtractor):
1601 """Information extractor for video.google.com."""
1602
490fd7ae 1603 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
49c0028a 1604
1605 def __init__(self, downloader=None):
1606 InfoExtractor.__init__(self, downloader)
1607
1608 @staticmethod
1609 def suitable(url):
1610 return (re.match(GoogleIE._VALID_URL, url) is not None)
1611
1612 def report_download_webpage(self, video_id):
1613 """Report webpage download."""
331ce0a0 1614 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
49c0028a 1615
1616 def report_extraction(self, video_id):
1617 """Report information extraction."""
331ce0a0 1618 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
49c0028a 1619
1620 def _real_initialize(self):
1621 return
1622
1623 def _real_extract(self, url):
1624 # Extract id from URL
1625 mobj = re.match(self._VALID_URL, url)
1626 if mobj is None:
1627 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1628 return
1629
df372a65 1630 # At this point we have a new video
9bf7fa52 1631 self._downloader.increment_downloads()
49c0028a 1632 video_id = mobj.group(1)
1633
1634 video_extension = 'mp4'
1635
1636 # Retrieve video webpage to extract further information
490fd7ae 1637 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
49c0028a 1638 try:
1639 self.report_download_webpage(video_id)
1640 webpage = urllib2.urlopen(request).read()
1641 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1642 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1643 return
1644
1645 # Extract URL, uploader, and title from webpage
1646 self.report_extraction(video_id)
490fd7ae
RG
1647 mobj = re.search(r"download_url:'([^']+)'", webpage)
1648 if mobj is None:
1649 video_extension = 'flv'
1650 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
49c0028a 1651 if mobj is None:
1652 self._downloader.trouble(u'ERROR: unable to extract media URL')
1653 return
1654 mediaURL = urllib.unquote(mobj.group(1))
1655 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1656 mediaURL = mediaURL.replace('\\x26', '\x26')
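			# (the page embeds the URL with hex escapes: '\x3d' decodes to '=' and '\x26' to '&')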
1657
1658 video_url = mediaURL
1659
1660 mobj = re.search(r'<title>(.*)</title>', webpage)
1661 if mobj is None:
1662 self._downloader.trouble(u'ERROR: unable to extract title')
1663 return
1664 video_title = mobj.group(1).decode('utf-8')
490fd7ae 1665 video_title = sanitize_title(video_title)
31cbdaaf 1666 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
49c0028a 1667
7e58d568
RG
1668 # Extract video description
1669 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1670 if mobj is None:
1671 self._downloader.trouble(u'ERROR: unable to extract video description')
1672 return
1673 video_description = mobj.group(1).decode('utf-8')
1674 if not video_description:
1675 video_description = 'No description available.'
1676
1677 # Extract video thumbnail
1678 if self._downloader.params.get('forcethumbnail', False):
1679 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1680 try:
1681 webpage = urllib2.urlopen(request).read()
1682 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1683 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1684 return
1685 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1686 if mobj is None:
1687 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1688 return
1689 video_thumbnail = mobj.group(1)
1690 else: # we need something to pass to process_info
1691 video_thumbnail = ''
1692
49c0028a 1693 try:
1694 # Process video information
1695 self._downloader.process_info({
1696 'id': video_id.decode('utf-8'),
1697 'url': video_url.decode('utf-8'),
6ba562b0 1698 'uploader': u'NA',
138b11f3 1699 'upload_date': u'NA',
490fd7ae 1700 'title': video_title,
31cbdaaf 1701 'stitle': simple_title,
49c0028a 1702 'ext': video_extension.decode('utf-8'),
6ba562b0 1703 'format': u'NA',
e616ec0c 1704 'player_url': None,
49c0028a 1705 })
73f4e7af 1706 except UnavailableVideoError:
09cc744c 1707 self._downloader.trouble(u'\nERROR: unable to download video')
49c0028a 1708
1709
1710class PhotobucketIE(InfoExtractor):
1711 """Information extractor for photobucket.com."""
1712
1713 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1714
1715 def __init__(self, downloader=None):
1716 InfoExtractor.__init__(self, downloader)
1717
1718 @staticmethod
1719 def suitable(url):
1720 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1721
1722 def report_download_webpage(self, video_id):
1723 """Report webpage download."""
331ce0a0 1724 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
49c0028a 1725
1726 def report_extraction(self, video_id):
1727 """Report information extraction."""
331ce0a0 1728 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
49c0028a 1729
1730 def _real_initialize(self):
1731 return
1732
1733 def _real_extract(self, url):
1734 # Extract id from URL
1735 mobj = re.match(self._VALID_URL, url)
1736 if mobj is None:
1737 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1738 return
1739
df372a65 1740 # At this point we have a new video
9bf7fa52 1741 self._downloader.increment_downloads()
49c0028a 1742 video_id = mobj.group(1)
1743
1744 video_extension = 'flv'
1745
1746 # Retrieve video webpage to extract further information
1747 request = urllib2.Request(url)
1748 try:
1749 self.report_download_webpage(video_id)
1750 webpage = urllib2.urlopen(request).read()
1751 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1752 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1753 return
1754
1755 # Extract URL, uploader, and title from webpage
1756 self.report_extraction(video_id)
1757 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1758 if mobj is None:
1759 self._downloader.trouble(u'ERROR: unable to extract media URL')
1760 return
1761 mediaURL = urllib.unquote(mobj.group(1))
1762
1763 video_url = mediaURL
1764
1765 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1766 if mobj is None:
1767 self._downloader.trouble(u'ERROR: unable to extract title')
1768 return
1769 video_title = mobj.group(1).decode('utf-8')
490fd7ae 1770 video_title = sanitize_title(video_title)
31cbdaaf 1771 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
49c0028a 1772
1773 video_uploader = mobj.group(2).decode('utf-8')
1774
1775 try:
1776 # Process video information
1777 self._downloader.process_info({
1778 'id': video_id.decode('utf-8'),
1779 'url': video_url.decode('utf-8'),
490fd7ae 1780 'uploader': video_uploader,
138b11f3 1781 'upload_date': u'NA',
490fd7ae 1782 'title': video_title,
31cbdaaf 1783 'stitle': simple_title,
490fd7ae 1784 'ext': video_extension.decode('utf-8'),
6ba562b0 1785 'format': u'NA',
e616ec0c 1786 'player_url': None,
490fd7ae 1787 })
73f4e7af 1788 except UnavailableVideoError:
09cc744c 1789 self._downloader.trouble(u'\nERROR: unable to download video')
490fd7ae
RG
1790
1791
61945318
RG
1792class YahooIE(InfoExtractor):
1793 """Information extractor for video.yahoo.com."""
1794
1795 # _VALID_URL matches all Yahoo! Video URLs
1796 # _VPAGE_URL matches only the extractable '/watch/' URLs
1797 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1798 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1799
1800 def __init__(self, downloader=None):
1801 InfoExtractor.__init__(self, downloader)
1802
1803 @staticmethod
1804 def suitable(url):
1805 return (re.match(YahooIE._VALID_URL, url) is not None)
1806
1807 def report_download_webpage(self, video_id):
1808 """Report webpage download."""
331ce0a0 1809 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
61945318
RG
1810
1811 def report_extraction(self, video_id):
1812 """Report information extraction."""
331ce0a0 1813 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
61945318
RG
1814
1815 def _real_initialize(self):
1816 return
1817
df372a65 1818 def _real_extract(self, url, new_video=True):
61945318
RG
1819 # Extract ID from URL
1820 mobj = re.match(self._VALID_URL, url)
1821 if mobj is None:
1822 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1823 return
1824
df372a65 1825 # At this point we have a new video
9bf7fa52 1826 self._downloader.increment_downloads()
61945318
RG
1827 video_id = mobj.group(2)
1828 video_extension = 'flv'
1829
1830 # Rewrite valid but non-extractable URLs as
1831 # extractable English language /watch/ URLs
1832 if re.match(self._VPAGE_URL, url) is None:
1833 request = urllib2.Request(url)
1834 try:
1835 webpage = urllib2.urlopen(request).read()
1836 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1837 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1838 return
1839
1840 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1841 if mobj is None:
1842 self._downloader.trouble(u'ERROR: Unable to extract id field')
1843 return
1844 yahoo_id = mobj.group(1)
1845
1846 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1847 if mobj is None:
1848 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1849 return
1850 yahoo_vid = mobj.group(1)
1851
1852 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
df372a65 1853 return self._real_extract(url, new_video=False)
61945318
RG
1854
1855 # Retrieve video webpage to extract further information
1856 request = urllib2.Request(url)
1857 try:
1858 self.report_download_webpage(video_id)
1859 webpage = urllib2.urlopen(request).read()
1860 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1861 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1862 return
1863
1864 # Extract uploader and title from webpage
1865 self.report_extraction(video_id)
1866 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1867 if mobj is None:
1868 self._downloader.trouble(u'ERROR: unable to extract video title')
1869 return
1870 video_title = mobj.group(1).decode('utf-8')
1871 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1872
1873 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1874 if mobj is None:
1875 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1876 return
 1877 video_uploader = mobj.group(2).decode('utf-8')
1878
7e58d568
RG
1879 # Extract video thumbnail
1880 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1881 if mobj is None:
1882 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1883 return
1884 video_thumbnail = mobj.group(1).decode('utf-8')
1885
1886 # Extract video description
1887 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1888 if mobj is None:
1889 self._downloader.trouble(u'ERROR: unable to extract video description')
1890 return
1891 video_description = mobj.group(1).decode('utf-8')
c0a10ca8
F
1892 if not video_description:
1893 video_description = 'No description available.'
7e58d568 1894
61945318
RG
1895 # Extract video height and width
1896 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1897 if mobj is None:
1898 self._downloader.trouble(u'ERROR: unable to extract video height')
1899 return
1900 yv_video_height = mobj.group(1)
1901
1902 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1903 if mobj is None:
1904 self._downloader.trouble(u'ERROR: unable to extract video width')
1905 return
1906 yv_video_width = mobj.group(1)
1907
1908 # Retrieve video playlist to extract media URL
1909 # I'm not completely sure what all these options are, but we
1910 # seem to need most of them, otherwise the server sends a 401.
1911 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1912 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1913 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
c0a10ca8
F
1914 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1915 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
61945318
RG
1916 try:
1917 self.report_download_webpage(video_id)
1918 webpage = urllib2.urlopen(request).read()
1919 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1920 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1921 return
1922
1923 # Extract media URL from playlist XML
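		# (the playlist XML appears to contain a <STREAM APP="..." FULLPATH="..."> element;
		#  APP holds the server base URL and FULLPATH the path to the .flv file, and their
		#  concatenation below is the downloadable URL)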
1924 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1925 if mobj is None:
1926 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1927 return
1928 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1929 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1930
1931 try:
1932 # Process video information
1933 self._downloader.process_info({
1934 'id': video_id.decode('utf-8'),
1935 'url': video_url,
1936 'uploader': video_uploader,
138b11f3 1937 'upload_date': u'NA',
61945318
RG
1938 'title': video_title,
1939 'stitle': simple_title,
1940 'ext': video_extension.decode('utf-8'),
7e58d568
RG
 1941 'thumbnail': video_thumbnail,
 1942 'description': video_description,
e616ec0c 1944 'player_url': None,
61945318 1945 })
73f4e7af 1946 except UnavailableVideoError:
09cc744c 1947 self._downloader.trouble(u'\nERROR: unable to download video')
61945318
RG
1948
1949
92743d42
RB
1950class VimeoIE(InfoExtractor):
1951 """Information extractor for vimeo.com."""
1952
1953 # _VALID_URL matches Vimeo URLs
44c636df 1954 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
92743d42
RB
1955
1956 def __init__(self, downloader=None):
1957 InfoExtractor.__init__(self, downloader)
1958
1959 @staticmethod
1960 def suitable(url):
1961 return (re.match(VimeoIE._VALID_URL, url) is not None)
1962
1963 def report_download_webpage(self, video_id):
1964 """Report webpage download."""
0ecedbdb 1965 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
92743d42
RB
1966
1967 def report_extraction(self, video_id):
1968 """Report information extraction."""
0ecedbdb 1969 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
92743d42
RB
1970
1971 def _real_initialize(self):
1972 return
1973
1974 def _real_extract(self, url, new_video=True):
1975 # Extract ID from URL
1976 mobj = re.match(self._VALID_URL, url)
1977 if mobj is None:
1978 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1979 return
1980
1981 # At this point we have a new video
1982 self._downloader.increment_downloads()
1983 video_id = mobj.group(1)
92743d42
RB
1984
1985 # Retrieve video webpage to extract further information
1986 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
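		# The moogaloop 'load' endpoint appears to return an XML description of the clip
		# (<caption>, <uploader_url>, <thumbnail>, <request_signature>,
		# <request_signature_expires>); those fields are picked out with regexes below.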
1987 try:
1988 self.report_download_webpage(video_id)
1989 webpage = urllib2.urlopen(request).read()
1990 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1991 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1992 return
1993
f24c674b
RB
1994 # Now we begin extracting as much information as we can from what we
1995 # retrieved. First we extract the information common to all extractors,
 1996 # and later we extract those that are Vimeo-specific.
92743d42 1997 self.report_extraction(video_id)
f24c674b
RB
1998
1999 # Extract title
c5a088d3 2000 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
92743d42
RB
2001 if mobj is None:
2002 self._downloader.trouble(u'ERROR: unable to extract video title')
2003 return
2004 video_title = mobj.group(1).decode('utf-8')
2005 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2006
f24c674b 2007 # Extract uploader
c5a088d3 2008 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
92743d42
RB
2009 if mobj is None:
2010 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2011 return
2012 video_uploader = mobj.group(1).decode('utf-8')
2013
2014 # Extract video thumbnail
c5a088d3 2015 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
92743d42
RB
2016 if mobj is None:
2017 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2018 return
2019 video_thumbnail = mobj.group(1).decode('utf-8')
2020
2021 # # Extract video description
2022 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2023 # if mobj is None:
2024 # self._downloader.trouble(u'ERROR: unable to extract video description')
2025 # return
2026 # video_description = mobj.group(1).decode('utf-8')
2027 # if not video_description: video_description = 'No description available.'
 2028 video_description = 'No description available.'
2029
f24c674b 2030 # Vimeo specific: extract request signature
c5a088d3 2031 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
92743d42
RB
2032 if mobj is None:
2033 self._downloader.trouble(u'ERROR: unable to extract request signature')
2034 return
2035 sig = mobj.group(1).decode('utf-8')
2036
f24c674b 2037 # Vimeo specific: Extract request signature expiration
c5a088d3 2038 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
92743d42
RB
2039 if mobj is None:
2040 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2041 return
2042 sig_exp = mobj.group(1).decode('utf-8')
2043
2044 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2045
2046 try:
2047 # Process video information
2048 self._downloader.process_info({
2049 'id': video_id.decode('utf-8'),
2050 'url': video_url,
2051 'uploader': video_uploader,
2052 'upload_date': u'NA',
2053 'title': video_title,
2054 'stitle': simple_title,
2fc31a48 2055 'ext': u'mp4',
92743d42
RB
 2056 'thumbnail': video_thumbnail,
 2057 'description': video_description,
2060 'player_url': None,
2061 })
2062 except UnavailableVideoError:
2063 self._downloader.trouble(u'ERROR: unable to download video')
2064
2065
490fd7ae
RG
2066class GenericIE(InfoExtractor):
2067 """Generic last-resort information extractor."""
2068
2069 def __init__(self, downloader=None):
2070 InfoExtractor.__init__(self, downloader)
2071
2072 @staticmethod
2073 def suitable(url):
2074 return True
2075
2076 def report_download_webpage(self, video_id):
2077 """Report webpage download."""
331ce0a0
RG
2078 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2079 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
490fd7ae
RG
2080
2081 def report_extraction(self, video_id):
2082 """Report information extraction."""
331ce0a0 2083 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
490fd7ae
RG
2084
2085 def _real_initialize(self):
2086 return
2087
2088 def _real_extract(self, url):
df372a65 2089 # At this point we have a new video
9bf7fa52 2090 self._downloader.increment_downloads()
df372a65 2091
490fd7ae
RG
2092 video_id = url.split('/')[-1]
2093 request = urllib2.Request(url)
2094 try:
2095 self.report_download_webpage(video_id)
2096 webpage = urllib2.urlopen(request).read()
2097 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2098 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2099 return
2100 except ValueError, err:
2101 # since this is the last-resort InfoExtractor, if
2102 # this error is thrown, it'll be thrown here
2103 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2104 return
2105
a9806fd8 2106 self.report_extraction(video_id)
490fd7ae
RG
2107 # Start with something easy: JW Player in SWFObject
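		# (matches embeds along the lines of  flashvars: "file=http://example.com/video.flv&autostart=true"  -- illustrative URL)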
2108 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2109 if mobj is None:
2110 # Broaden the search a little bit
2111 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2112 if mobj is None:
2113 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2114 return
2115
2116 # It's possible that one of the regexes
2117 # matched, but returned an empty group:
2118 if mobj.group(1) is None:
2119 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2120 return
2121
2122 video_url = urllib.unquote(mobj.group(1))
c0a10ca8 2123 video_id = os.path.basename(video_url)
490fd7ae
RG
2124
2125 # here's a fun little line of code for you:
2126 video_extension = os.path.splitext(video_id)[1][1:]
c0a10ca8 2127 video_id = os.path.splitext(video_id)[0]
490fd7ae
RG
2128
2129 # it's tempting to parse this further, but you would
2130 # have to take into account all the variations like
2131 # Video Title - Site Name
2132 # Site Name | Video Title
2133 # Video Title - Tagline | Site Name
2134 # and so on and so forth; it's just not practical
2135 mobj = re.search(r'<title>(.*)</title>', webpage)
2136 if mobj is None:
2137 self._downloader.trouble(u'ERROR: unable to extract title')
2138 return
2139 video_title = mobj.group(1).decode('utf-8')
2140 video_title = sanitize_title(video_title)
31cbdaaf 2141 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
490fd7ae
RG
2142
2143 # video uploader is domain name
2144 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2145 if mobj is None:
 2146 self._downloader.trouble(u'ERROR: unable to extract uploader (domain name)')
2147 return
2148 video_uploader = mobj.group(1).decode('utf-8')
2149
2150 try:
2151 # Process video information
2152 self._downloader.process_info({
2153 'id': video_id.decode('utf-8'),
2154 'url': video_url.decode('utf-8'),
2155 'uploader': video_uploader,
138b11f3 2156 'upload_date': u'NA',
490fd7ae 2157 'title': video_title,
31cbdaaf 2158 'stitle': simple_title,
49c0028a 2159 'ext': video_extension.decode('utf-8'),
6ba562b0 2160 'format': u'NA',
e616ec0c 2161 'player_url': None,
49c0028a 2162 })
73f4e7af 2163 except UnavailableVideoError, err:
09cc744c 2164 self._downloader.trouble(u'\nERROR: unable to download video')
49c0028a 2165
2166
25af2bce
RG
2167class YoutubeSearchIE(InfoExtractor):
2168 """Information Extractor for YouTube search queries."""
2169 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2170 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2171 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
304a4d85 2172 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
25af2bce 2173 _youtube_ie = None
fd9288c3 2174 _max_youtube_results = 1000
25af2bce 2175
f995f712 2176 def __init__(self, youtube_ie, downloader=None):
25af2bce
RG
2177 InfoExtractor.__init__(self, downloader)
2178 self._youtube_ie = youtube_ie
d3975459 2179
25af2bce
RG
2180 @staticmethod
2181 def suitable(url):
2182 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2183
2184 def report_download_page(self, query, pagenum):
2185 """Report attempt to download playlist page with given number."""
490fd7ae 2186 query = query.decode(preferredencoding())
331ce0a0 2187 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
25af2bce
RG
2188
2189 def _real_initialize(self):
2190 self._youtube_ie.initialize()
d3975459 2191
25af2bce
RG
2192 def _real_extract(self, query):
2193 mobj = re.match(self._VALID_QUERY, query)
2194 if mobj is None:
147753eb 2195 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
6f21f686 2196 return
25af2bce
RG
2197
2198 prefix, query = query.split(':')
2199 prefix = prefix[8:]
c0a10ca8 2200 query = query.encode('utf-8')
f995f712 2201 if prefix == '':
6f21f686
RG
2202 self._download_n_results(query, 1)
2203 return
f995f712 2204 elif prefix == 'all':
6f21f686
RG
2205 self._download_n_results(query, self._max_youtube_results)
2206 return
f995f712 2207 else:
25af2bce 2208 try:
e1f18b8a 2209 n = long(prefix)
25af2bce 2210 if n <= 0:
147753eb 2211 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
6f21f686 2212 return
257453b9 2213 elif n > self._max_youtube_results:
c0a10ca8 2214 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
257453b9 2215 n = self._max_youtube_results
6f21f686
RG
2216 self._download_n_results(query, n)
2217 return
e1f18b8a 2218 except ValueError: # parsing prefix as integer fails
6f21f686
RG
2219 self._download_n_results(query, 1)
2220 return
25af2bce
RG
2221
2222 def _download_n_results(self, query, n):
2223 """Downloads a specified number of results for a query"""
2224
2225 video_ids = []
2226 already_seen = set()
2227 pagenum = 1
2228
2229 while True:
2230 self.report_download_page(query, pagenum)
a9633f14 2231 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1987c232 2232 request = urllib2.Request(result_url)
25af2bce
RG
2233 try:
2234 page = urllib2.urlopen(request).read()
2235 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 2236 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
6f21f686 2237 return
25af2bce
RG
2238
2239 # Extract video identifiers
2240 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
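				# each match typically looks like href="/watch?v=VIDEOID"; splitting on '='
				# and dropping the trailing quote leaves just the video id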
2241 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2242 if video_id not in already_seen:
2243 video_ids.append(video_id)
2244 already_seen.add(video_id)
2245 if len(video_ids) == n:
2246 # Specified n videos reached
25af2bce 2247 for id in video_ids:
6f21f686
RG
2248 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2249 return
25af2bce 2250
304a4d85 2251 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
25af2bce 2252 for id in video_ids:
6f21f686
RG
2253 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2254 return
25af2bce
RG
2255
2256 pagenum = pagenum + 1
2257
c0a10ca8 2258
7e58d568
RG
2259class GoogleSearchIE(InfoExtractor):
2260 """Information Extractor for Google Video search queries."""
2261 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2262 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2263 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2264 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2265 _google_ie = None
2266 _max_google_results = 1000
2267
2268 def __init__(self, google_ie, downloader=None):
2269 InfoExtractor.__init__(self, downloader)
2270 self._google_ie = google_ie
d3975459 2271
7e58d568
RG
2272 @staticmethod
2273 def suitable(url):
2274 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2275
2276 def report_download_page(self, query, pagenum):
2277 """Report attempt to download playlist page with given number."""
2278 query = query.decode(preferredencoding())
331ce0a0 2279 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
7e58d568
RG
2280
2281 def _real_initialize(self):
2282 self._google_ie.initialize()
d3975459 2283
7e58d568
RG
2284 def _real_extract(self, query):
2285 mobj = re.match(self._VALID_QUERY, query)
2286 if mobj is None:
2287 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2288 return
2289
2290 prefix, query = query.split(':')
2291 prefix = prefix[8:]
c0a10ca8 2292 query = query.encode('utf-8')
7e58d568
RG
2293 if prefix == '':
2294 self._download_n_results(query, 1)
2295 return
2296 elif prefix == 'all':
2297 self._download_n_results(query, self._max_google_results)
2298 return
2299 else:
2300 try:
2301 n = long(prefix)
2302 if n <= 0:
2303 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2304 return
2305 elif n > self._max_google_results:
c0a10ca8 2306 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
7e58d568
RG
2307 n = self._max_google_results
2308 self._download_n_results(query, n)
2309 return
2310 except ValueError: # parsing prefix as integer fails
2311 self._download_n_results(query, 1)
2312 return
2313
2314 def _download_n_results(self, query, n):
2315 """Downloads a specified number of results for a query"""
2316
2317 video_ids = []
2318 already_seen = set()
2319 pagenum = 1
2320
2321 while True:
2322 self.report_download_page(query, pagenum)
2323 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1987c232 2324 request = urllib2.Request(result_url)
7e58d568
RG
2325 try:
2326 page = urllib2.urlopen(request).read()
2327 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2328 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2329 return
2330
2331 # Extract video identifiers
2332 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2333 video_id = mobj.group(1)
2334 if video_id not in already_seen:
2335 video_ids.append(video_id)
2336 already_seen.add(video_id)
2337 if len(video_ids) == n:
2338 # Specified n videos reached
2339 for id in video_ids:
2340 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2341 return
2342
2343 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2344 for id in video_ids:
2345 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2346 return
2347
2348 pagenum = pagenum + 1
2349
c0a10ca8 2350
7e58d568
RG
2351class YahooSearchIE(InfoExtractor):
2352 """Information Extractor for Yahoo! Video search queries."""
2353 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2354 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2355 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2356 _MORE_PAGES_INDICATOR = r'\s*Next'
2357 _yahoo_ie = None
2358 _max_yahoo_results = 1000
2359
2360 def __init__(self, yahoo_ie, downloader=None):
2361 InfoExtractor.__init__(self, downloader)
2362 self._yahoo_ie = yahoo_ie
d3975459 2363
7e58d568
RG
2364 @staticmethod
2365 def suitable(url):
2366 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2367
2368 def report_download_page(self, query, pagenum):
2369 """Report attempt to download playlist page with given number."""
2370 query = query.decode(preferredencoding())
331ce0a0 2371 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
7e58d568
RG
2372
2373 def _real_initialize(self):
2374 self._yahoo_ie.initialize()
d3975459 2375
7e58d568
RG
2376 def _real_extract(self, query):
2377 mobj = re.match(self._VALID_QUERY, query)
2378 if mobj is None:
2379 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2380 return
2381
2382 prefix, query = query.split(':')
2383 prefix = prefix[8:]
c0a10ca8 2384 query = query.encode('utf-8')
7e58d568
RG
2385 if prefix == '':
2386 self._download_n_results(query, 1)
2387 return
2388 elif prefix == 'all':
2389 self._download_n_results(query, self._max_yahoo_results)
2390 return
2391 else:
2392 try:
2393 n = long(prefix)
2394 if n <= 0:
2395 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2396 return
2397 elif n > self._max_yahoo_results:
c0a10ca8 2398 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
7e58d568
RG
2399 n = self._max_yahoo_results
2400 self._download_n_results(query, n)
2401 return
2402 except ValueError: # parsing prefix as integer fails
2403 self._download_n_results(query, 1)
2404 return
2405
2406 def _download_n_results(self, query, n):
2407 """Downloads a specified number of results for a query"""
2408
2409 video_ids = []
2410 already_seen = set()
2411 pagenum = 1
2412
2413 while True:
2414 self.report_download_page(query, pagenum)
2415 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1987c232 2416 request = urllib2.Request(result_url)
7e58d568
RG
2417 try:
2418 page = urllib2.urlopen(request).read()
2419 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2420 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2421 return
2422
2423 # Extract video identifiers
2424 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2425 video_id = mobj.group(1)
2426 if video_id not in already_seen:
2427 video_ids.append(video_id)
2428 already_seen.add(video_id)
2429 if len(video_ids) == n:
2430 # Specified n videos reached
2431 for id in video_ids:
2432 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2433 return
2434
2435 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2436 for id in video_ids:
2437 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2438 return
2439
2440 pagenum = pagenum + 1
2441
c0a10ca8 2442
0c2dc87d
RG
2443class YoutubePlaylistIE(InfoExtractor):
2444 """Information Extractor for YouTube playlists."""
2445
2152ee86 2446 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
f74e22ae 2447 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
0c2dc87d 2448 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
ce5cafea 2449 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
0c2dc87d
RG
2450 _youtube_ie = None
2451
2452 def __init__(self, youtube_ie, downloader=None):
2453 InfoExtractor.__init__(self, downloader)
2454 self._youtube_ie = youtube_ie
d3975459 2455
0c2dc87d
RG
2456 @staticmethod
2457 def suitable(url):
2458 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2459
2460 def report_download_page(self, playlist_id, pagenum):
2461 """Report attempt to download playlist page with given number."""
331ce0a0 2462 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
0c2dc87d
RG
2463
2464 def _real_initialize(self):
2465 self._youtube_ie.initialize()
d3975459 2466
0c2dc87d
RG
2467 def _real_extract(self, url):
2468 # Extract playlist id
2469 mobj = re.match(self._VALID_URL, url)
2470 if mobj is None:
147753eb 2471 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
6f21f686 2472 return
0c2dc87d 2473
d119b54d
RG
2474 # Single video case
2475 if mobj.group(3) is not None:
2476 self._youtube_ie.extract(mobj.group(3))
2477 return
2478
0c2dc87d 2479 # Download playlist pages
f74e22ae
GI
 2480 # The prefix defaults to 'p' for plain playlists, but other playlist types need extra care
2481 playlist_prefix = mobj.group(1)
2482 if playlist_prefix == 'a':
2483 playlist_access = 'artist'
2484 else:
7cc3c6fd 2485 playlist_prefix = 'p'
f74e22ae
GI
2486 playlist_access = 'view_play_list'
2487 playlist_id = mobj.group(2)
0c2dc87d
RG
2488 video_ids = []
2489 pagenum = 1
2490
2491 while True:
2492 self.report_download_page(playlist_id, pagenum)
f74e22ae 2493 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
0c2dc87d
RG
2494 try:
2495 page = urllib2.urlopen(request).read()
2496 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 2497 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
6f21f686 2498 return
0c2dc87d
RG
2499
2500 # Extract video identifiers
27d98b6e 2501 ids_in_page = []
0c2dc87d 2502 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
27d98b6e
RG
2503 if mobj.group(1) not in ids_in_page:
2504 ids_in_page.append(mobj.group(1))
2505 video_ids.extend(ids_in_page)
0c2dc87d 2506
ce5cafea 2507 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
0c2dc87d
RG
2508 break
2509 pagenum = pagenum + 1
2510
8cc44341
RG
2511 playliststart = self._downloader.params.get('playliststart', 1) - 1
2512 playlistend = self._downloader.params.get('playlistend', -1)
 2513 if playlistend == -1:
 	video_ids = video_ids[playliststart:]
 else:
 	video_ids = video_ids[playliststart:playlistend]
2514
0c2dc87d 2515 for id in video_ids:
6f21f686
RG
2516 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2517 return
0c2dc87d 2518
c0a10ca8 2519
c39c05cd
A
2520class YoutubeUserIE(InfoExtractor):
2521 """Information Extractor for YouTube users."""
2522
5aba6ea4 2523 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
c39c05cd 2524 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
5aba6ea4
RG
2525 _GDATA_PAGE_SIZE = 50
2526 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2527 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
c39c05cd
A
2528 _youtube_ie = None
2529
2530 def __init__(self, youtube_ie, downloader=None):
2531 InfoExtractor.__init__(self, downloader)
2532 self._youtube_ie = youtube_ie
d3975459 2533
c39c05cd
A
2534 @staticmethod
2535 def suitable(url):
2536 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2537
5aba6ea4 2538 def report_download_page(self, username, start_index):
c39c05cd 2539 """Report attempt to download user page."""
5aba6ea4 2540 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
c0a10ca8 2541 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
c39c05cd
A
2542
2543 def _real_initialize(self):
2544 self._youtube_ie.initialize()
d3975459 2545
c39c05cd
A
2546 def _real_extract(self, url):
2547 # Extract username
2548 mobj = re.match(self._VALID_URL, url)
2549 if mobj is None:
2550 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2551 return
2552
c39c05cd 2553 username = mobj.group(1)
5aba6ea4
RG
2554
2555 # Download video ids using YouTube Data API. Result size per
2556 # query is limited (currently to 50 videos) so we need to query
 2557 # page by page until no more video ids come back, at which
 2558 # point we have all of them.
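		# e.g. the first request uses start-index=1, the next start-index=51, and so on,
		# against URLs of the form
		# http://gdata.youtube.com/feeds/api/users/USERNAME/uploads?max-results=50&start-index=1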
2559
c39c05cd 2560 video_ids = []
5aba6ea4 2561 pagenum = 0
c39c05cd 2562
5aba6ea4
RG
2563 while True:
2564 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2565 self.report_download_page(username, start_index)
c39c05cd 2566
5aba6ea4 2567 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
c39c05cd 2568
5aba6ea4
RG
2569 try:
2570 page = urllib2.urlopen(request).read()
2571 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2572 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2573 return
c39c05cd 2574
5aba6ea4
RG
2575 # Extract video identifiers
2576 ids_in_page = []
2577
2578 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2579 if mobj.group(1) not in ids_in_page:
2580 ids_in_page.append(mobj.group(1))
2581
2582 video_ids.extend(ids_in_page)
2583
2584 # A little optimization - if current page is not
2585 # "full", ie. does not contain PAGE_SIZE video ids then
2586 # we can assume that this page is the last one - there
2587 # are no more ids on further pages - no need to query
2588 # again.
2589
2590 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2591 break
2592
2593 pagenum += 1
2594
2595 all_ids_count = len(video_ids)
8cc44341
RG
2596 playliststart = self._downloader.params.get('playliststart', 1) - 1
2597 playlistend = self._downloader.params.get('playlistend', -1)
204c9398 2598
5aba6ea4
RG
2599 if playlistend == -1:
2600 video_ids = video_ids[playliststart:]
2601 else:
2602 video_ids = video_ids[playliststart:playlistend]
7a9054ec 2603
5aba6ea4 2604 self._downloader.to_screen(u'[youtube] user %s: Collected %d video ids (downloading %d of them)' %
c0a10ca8 2605 (username, all_ids_count, len(video_ids)))
5aba6ea4
RG
2606
2607 for video_id in video_ids:
2608 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2609
c39c05cd 2610
27179cfd
VV
2611class DepositFilesIE(InfoExtractor):
2612 """Information extractor for depositfiles.com"""
2613
2614 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2615
2616 def __init__(self, downloader=None):
2617 InfoExtractor.__init__(self, downloader)
2618
2619 @staticmethod
2620 def suitable(url):
2621 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2622
2623 def report_download_webpage(self, file_id):
2624 """Report webpage download."""
2625 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2626
2627 def report_extraction(self, file_id):
2628 """Report information extraction."""
2629 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2630
2631 def _real_initialize(self):
2632 return
2633
2634 def _real_extract(self, url):
2635 # At this point we have a new file
2636 self._downloader.increment_downloads()
2637
2638 file_id = url.split('/')[-1]
 2639 # Rebuild the URL with the English locale
2640 url = 'http://depositfiles.com/en/files/' + file_id
2641
2642 # Retrieve file webpage with 'Free download' button pressed
2643 free_download_indication = { 'gateway_result' : '1' }
1987c232 2644 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
27179cfd
VV
2645 try:
2646 self.report_download_webpage(file_id)
2647 webpage = urllib2.urlopen(request).read()
2648 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2649 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2650 return
2651
2652 # Search for the real file URL
2653 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2654 if (mobj is None) or (mobj.group(1) is None):
2655 # Try to figure out reason of the error.
2656 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2657 if (mobj is not None) and (mobj.group(1) is not None):
 2658 restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
2659 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2660 else:
2661 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2662 return
2663
2664 file_url = mobj.group(1)
2665 file_extension = os.path.splitext(file_url)[1][1:]
2666
2667 # Search for file title
2668 mobj = re.search(r'<b title="(.*?)">', webpage)
2669 if mobj is None:
2670 self._downloader.trouble(u'ERROR: unable to extract title')
2671 return
2672 file_title = mobj.group(1).decode('utf-8')
2673
2674 try:
2675 # Process file information
2676 self._downloader.process_info({
2677 'id': file_id.decode('utf-8'),
2678 'url': file_url.decode('utf-8'),
2679 'uploader': u'NA',
2680 'upload_date': u'NA',
2681 'title': file_title,
2682 'stitle': file_title,
2683 'ext': file_extension.decode('utf-8'),
2684 'format': u'NA',
2685 'player_url': None,
2686 })
2687 except UnavailableVideoError, err:
2688 self._downloader.trouble(u'ERROR: unable to download file')
2689
c0a10ca8 2690
9f5f9602
GI
2691class FacebookIE(InfoExtractor):
2692 """Information Extractor for Facebook"""
2693
2694 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2695 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2696 _NETRC_MACHINE = 'facebook'
2697 _available_formats = ['highqual', 'lowqual']
2698 _video_extensions = {
2699 'highqual': 'mp4',
2700 'lowqual': 'mp4',
2701 }
2702
2703 def __init__(self, downloader=None):
2704 InfoExtractor.__init__(self, downloader)
2705
2706 @staticmethod
2707 def suitable(url):
2708 return (re.match(FacebookIE._VALID_URL, url) is not None)
2709
2710 def _reporter(self, message):
2711 """Add header and report message."""
2712 self._downloader.to_screen(u'[facebook] %s' % message)
2713
2714 def report_login(self):
2715 """Report attempt to log in."""
2716 self._reporter(u'Logging in')
2717
2718 def report_video_webpage_download(self, video_id):
2719 """Report attempt to download video webpage."""
2720 self._reporter(u'%s: Downloading video webpage' % video_id)
2721
2722 def report_information_extraction(self, video_id):
2723 """Report attempt to extract video information."""
2724 self._reporter(u'%s: Extracting video information' % video_id)
2725
2726 def _parse_page(self, video_webpage):
2727 """Extract video information from page"""
2728 # General data
2729 data = {'title': r'class="video_title datawrap">(.*?)</',
2730 'description': r'<div class="datawrap">(.*?)</div>',
2731 'owner': r'\("video_owner_name", "(.*?)"\)',
2732 'upload_date': r'data-date="(.*?)"',
2733 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2734 }
2735 video_info = {}
2736 for piece in data.keys():
2737 mobj = re.search(data[piece], video_webpage)
2738 if mobj is not None:
2739 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2740
2741 # Video urls
2742 video_urls = {}
2743 for fmt in self._available_formats:
2744 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2745 if mobj is not None:
 2746 # The URL is in a JavaScript segment, in escaped Unicode format, within
 2747 # the generally UTF-8 page
2748 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2749 video_info['video_urls'] = video_urls
2750
2751 return video_info
2752
2753 def _real_initialize(self):
2754 if self._downloader is None:
2755 return
2756
2757 useremail = None
2758 password = None
2759 downloader_params = self._downloader.params
2760
2761 # Attempt to use provided username and password or .netrc data
2762 if downloader_params.get('username', None) is not None:
2763 useremail = downloader_params['username']
2764 password = downloader_params['password']
2765 elif downloader_params.get('usenetrc', False):
2766 try:
2767 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2768 if info is not None:
2769 useremail = info[0]
2770 password = info[2]
2771 else:
2772 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2773 except (IOError, netrc.NetrcParseError), err:
2774 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2775 return
2776
2777 if useremail is None:
2778 return
2779
2780 # Log in
2781 login_form = {
2782 'email': useremail,
2783 'pass': password,
2784 'login': 'Log+In'
2785 }
2786 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2787 try:
2788 self.report_login()
2789 login_results = urllib2.urlopen(request).read()
2790 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
 2791 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
2792 return
2793 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2794 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2795 return
2796
2797 def _real_extract(self, url):
2798 mobj = re.match(self._VALID_URL, url)
2799 if mobj is None:
2800 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2801 return
2802 video_id = mobj.group('ID')
2803
2804 # Get video webpage
2805 self.report_video_webpage_download(video_id)
2806 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2807 try:
2808 page = urllib2.urlopen(request)
2809 video_webpage = page.read()
2810 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2811 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2812 return
2813
2814 # Start extracting information
2815 self.report_information_extraction(video_id)
2816
2817 # Extract information
2818 video_info = self._parse_page(video_webpage)
2819
2820 # uploader
2821 if 'owner' not in video_info:
2822 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2823 return
2824 video_uploader = video_info['owner']
2825
2826 # title
2827 if 'title' not in video_info:
2828 self._downloader.trouble(u'ERROR: unable to extract video title')
2829 return
2830 video_title = video_info['title']
2831 video_title = video_title.decode('utf-8')
2832 video_title = sanitize_title(video_title)
2833
2834 # simplified title
2835 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2836 simple_title = simple_title.strip(ur'_')
2837
2838 # thumbnail image
2839 if 'thumbnail' not in video_info:
2840 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2841 video_thumbnail = ''
2842 else:
2843 video_thumbnail = video_info['thumbnail']
2844
2845 # upload date
2846 upload_date = u'NA'
2847 if 'upload_date' in video_info:
2848 upload_time = video_info['upload_date']
2849 timetuple = email.utils.parsedate_tz(upload_time)
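			# parsedate_tz returns a 10-tuple (the nine struct_time fields plus a UTC offset);
			# the first nine fields are what strftime needs, giving e.g. '20110815' (illustrative)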
2850 if timetuple is not None:
2851 try:
2852 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2853 except:
2854 pass
2855
2856 # description
8b95c387 2857 video_description = video_info.get('description', 'No description available.')
9f5f9602
GI
2858
2859 url_map = video_info['video_urls']
2860 if len(url_map.keys()) > 0:
2861 # Decide which formats to download
2862 req_format = self._downloader.params.get('format', None)
2863 format_limit = self._downloader.params.get('format_limit', None)
2864
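			# Format selection below appears to mirror the YouTube extractor: no requested
			# format means best available quality, '-1' means every available format, and
			# anything else names one specific format.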
2865 if format_limit is not None and format_limit in self._available_formats:
2866 format_list = self._available_formats[self._available_formats.index(format_limit):]
2867 else:
2868 format_list = self._available_formats
2869 existing_formats = [x for x in format_list if x in url_map]
2870 if len(existing_formats) == 0:
2871 self._downloader.trouble(u'ERROR: no known formats available for video')
2872 return
2873 if req_format is None:
2874 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2875 elif req_format == '-1':
2876 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2877 else:
2878 # Specific format
2879 if req_format not in url_map:
2880 self._downloader.trouble(u'ERROR: requested format not available')
2881 return
2882 video_url_list = [(req_format, url_map[req_format])] # Specific format
2883
2884 for format_param, video_real_url in video_url_list:
2885
2886 # At this point we have a new video
2887 self._downloader.increment_downloads()
2888
2889 # Extension
2890 video_extension = self._video_extensions.get(format_param, 'mp4')
2891
9f5f9602
GI
2892 try:
2893 # Process video information
2894 self._downloader.process_info({
2895 'id': video_id.decode('utf-8'),
2896 'url': video_real_url.decode('utf-8'),
2897 'uploader': video_uploader.decode('utf-8'),
2898 'upload_date': upload_date,
2899 'title': video_title,
2900 'stitle': simple_title,
2901 'ext': video_extension.decode('utf-8'),
2902 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2903 'thumbnail': video_thumbnail.decode('utf-8'),
2904 'description': video_description.decode('utf-8'),
2905 'player_url': None,
2906 })
2907 except UnavailableVideoError, err:
2908 self._downloader.trouble(u'\nERROR: unable to download video')
2909
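# The format-selection cascade above (honour --max-quality, then fall back to
# best quality, all formats, or one specific format) is easier to see in
# isolation. The helper below is only an illustrative sketch of that logic
# with hypothetical data; nothing in this script calls it.
def _example_select_formats(url_map, available_formats, req_format=None, format_limit=None):
	"""Mirror the (format, url) selection used by FacebookIE above."""
	if format_limit is not None and format_limit in available_formats:
		format_list = available_formats[available_formats.index(format_limit):]
	else:
		format_list = available_formats
	existing_formats = [x for x in format_list if x in url_map]
	if len(existing_formats) == 0:
		return []
	if req_format is None:
		return [(existing_formats[0], url_map[existing_formats[0]])]	# best quality
	if req_format == '-1':
		return [(f, url_map[f]) for f in existing_formats]	# all formats
	if req_format not in url_map:
		return []	# requested format not available
	return [(req_format, url_map[req_format])]	# one specific format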
7745f5d8
PH
2910class BlipTVIE(InfoExtractor):
2911 """Information extractor for blip.tv"""
2912
1cab2c6d 2913 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
7745f5d8
PH
2914 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2915
2916 @staticmethod
2917 def suitable(url):
2918 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2919
7745f5d8
PH
2920 def report_extraction(self, file_id):
2921 """Report information extraction."""
aded78d9 2922 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
7745f5d8
PH
2923
2924 def _simplify_title(self, title):
2925 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2926 res = res.strip(ur'_')
2927 return res
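	# A sketch of the effect: with simple_title_chars limited to ASCII letters
	# and digits, a title such as u'Blip.tv: episode #1' becomes
	# u'Blip_tv_episode_1' (runs of other characters collapse to '_', and
	# leading/trailing underscores are stripped).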
2928
2929 def _real_extract(self, url):
2930 mobj = re.match(self._VALID_URL, url)
2931 if mobj is None:
2932 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2933 return
2934
1293ce58
PH
2935 if '?' in url:
2936 cchar = '&'
2937 else:
2938 cchar = '?'
2939 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
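		# For example (hypothetical URL), http://blip.tv/file/123456 becomes
		# http://blip.tv/file/123456?skin=json&version=2&no_wrap=1, which asks
		# blip.tv for the post metadata as JSON instead of an HTML page.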
7745f5d8 2940 request = urllib2.Request(json_url)
aded78d9 2941 self.report_extraction(mobj.group(1))
7745f5d8
PH
2942 try:
2943 json_code = urllib2.urlopen(request).read()
2944 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2945 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2946 return
2947 try:
2948 json_data = json.loads(json_code)
1293ce58
PH
2949 if 'Post' in json_data:
2950 data = json_data['Post']
2951 else:
2952 data = json_data
7745f5d8
PH
2953
2954 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2955 video_url = data['media']['url']
2956 umobj = re.match(self._URL_EXT, video_url)
2957 if umobj is None:
2958 raise ValueError('Can not determine filename extension')
2959 ext = umobj.group(1)
2960
a1cab7ce
PH
2961 self._downloader.increment_downloads()
2962
7745f5d8
PH
2963 info = {
2964 'id': data['item_id'],
2965 'url': video_url,
2966 'uploader': data['display_name'],
2967 'upload_date': upload_date,
2968 'title': data['title'],
2969 'stitle': self._simplify_title(data['title']),
2970 'ext': ext,
2971 'format': data['media']['mimeType'],
2972 'thumbnail': data['thumbnailUrl'],
2973 'description': data['description'],
2974 'player_url': data['embedUrl']
2975 }
2976 except (ValueError,KeyError), err:
aded78d9 2977 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
7745f5d8
PH
2978 return
2979
2980 try:
2981 self._downloader.process_info(info)
2982 except UnavailableVideoError, err:
2983 self._downloader.trouble(u'\nERROR: unable to download video')
2984
2985
9b0a8bc1
PH
2986class MyVideoIE(InfoExtractor):
2987 """Information Extractor for myvideo.de."""
2988
2989 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2990
2991 def __init__(self, downloader=None):
2992 InfoExtractor.__init__(self, downloader)
2993
2994 @staticmethod
2995 def suitable(url):
2996 return (re.match(MyVideoIE._VALID_URL, url) is not None)
2997
2998 def report_download_webpage(self, video_id):
2999 """Report webpage download."""
3000 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3001
3002 def report_extraction(self, video_id):
3003 """Report information extraction."""
3004 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3005
3006 def _real_initialize(self):
3007 return
3008
3009 def _real_extract(self,url):
3010 mobj = re.match(self._VALID_URL, url)
3011 if mobj is None:
3012 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3013 return
3014
3015 video_id = mobj.group(1)
3016 simple_title = mobj.group(2).decode('utf-8')
3017 # should actually not be necessary
3018 simple_title = sanitize_title(simple_title)
3019 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
3020
3021 # Get video webpage
3022 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3023 try:
3024 self.report_download_webpage(video_id)
3025 webpage = urllib2.urlopen(request).read()
3026 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3027 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3028 return
3029
3030 self.report_extraction(video_id)
3031 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3032 webpage)
3033 if mobj is None:
3034 self._downloader.trouble(u'ERROR: unable to extract media URL')
3035 return
3036 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3037
3038 mobj = re.search('<title>([^<]+)</title>', webpage)
3039 if mobj is None:
3040 self._downloader.trouble(u'ERROR: unable to extract title')
3041 return
3042
3043 video_title = mobj.group(1)
3044 video_title = sanitize_title(video_title)
3045
3046 try:
3048 self._downloader.process_info({
3049 'id': video_id,
3050 'url': video_url,
3051 'uploader': u'NA',
3052 'upload_date': u'NA',
3053 'title': video_title,
3054 'stitle': simple_title,
3055 'ext': u'flv',
3056 'format': u'NA',
3057 'player_url': None,
3058 })
3059 except UnavailableVideoError:
3060 self._downloader.trouble(u'\nERROR: Unable to download video')
3061
c8e30044 3062class ComedyCentralIE(InfoExtractor):
f166bccc 3063 """Information extractor for The Daily Show and The Colbert Report"""
c8e30044 3064
f166bccc 3065 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
c8e30044
PH
3066
3067 @staticmethod
3068 def suitable(url):
3069 return (re.match(ComedyCentralIE._VALID_URL, url) is not None)
3070
3071 def report_extraction(self, episode_id):
3072 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3073
3074 def report_config_download(self, episode_id):
3075 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3076
b487ef08
PH
3077 def report_index_download(self, episode_id):
3078 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3079
fedf9f39
PH
3080 def report_player_url(self, episode_id):
3081 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3082
c8e30044
PH
3083 def _simplify_title(self, title):
3084 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3085 res = res.strip(ur'_')
3086 return res
3087
3088 def _real_extract(self, url):
3089 mobj = re.match(self._VALID_URL, url)
3090 if mobj is None:
3091 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3092 return
f166bccc
PH
3093
3094 if mobj.group('shortname'):
3095 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3096 url = 'http://www.thedailyshow.com/full-episodes/'
3097 else:
3098 url = 'http://www.colbertnation.com/full-episodes/'
3099 mobj = re.match(self._VALID_URL, url)
3100 assert mobj is not None
3101
3102 dlNewest = not mobj.group('episode')
3103 if dlNewest:
3104 epTitle = mobj.group('showname')
3105 else:
3106 epTitle = mobj.group('episode')
c8e30044
PH
3107
3108 req = urllib2.Request(url)
3109 self.report_extraction(epTitle)
3110 try:
f166bccc
PH
3111 htmlHandle = urllib2.urlopen(req)
3112 html = htmlHandle.read()
c8e30044
PH
3113 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3114 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3115 return
f166bccc
PH
3116 if dlNewest:
3117 url = htmlHandle.geturl()
3118 mobj = re.match(self._VALID_URL, url)
3119 if mobj is None:
3120 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3121 return
3122 if mobj.group('episode') == '':
3123 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3124 return
3125 epTitle = mobj.group('episode')
c8e30044 3126
b487ef08 3127 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
c8e30044
PH
3128 if len(mMovieParams) == 0:
3129 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3130 return
b487ef08
PH
3131
3132 playerUrl_raw = mMovieParams[0][0]
fedf9f39
PH
3133 self.report_player_url(epTitle)
3134 try:
b487ef08
PH
3135 urlHandle = urllib2.urlopen(playerUrl_raw)
3136 playerUrl = urlHandle.geturl()
3137 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3138 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3139 return
3140
3141 uri = mMovieParams[0][1]
3142 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3143 self.report_index_download(epTitle)
3144 try:
3145 indexXml = urllib2.urlopen(indexUrl).read()
fedf9f39 3146 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
b487ef08 3147 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
fedf9f39 3148 return
fedf9f39 3149
b487ef08
PH
3150 idoc = xml.etree.ElementTree.fromstring(indexXml)
3151 itemEls = idoc.findall('.//item')
3152 for itemEl in itemEls:
3153 mediaId = itemEl.findall('./guid')[0].text
3154 shortMediaId = mediaId.split(':')[-1]
3155 showId = mediaId.split(':')[-2].replace('.com', '')
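			# As a sketch, a guid of the form 'mgid:cms:video:thedailyshow.com:123456'
			# (hypothetical value) would give shortMediaId '123456' and showId
			# 'thedailyshow'.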
3156 officialTitle = itemEl.findall('./title')[0].text
3157 officialDate = itemEl.findall('./pubDate')[0].text
3158
c8e30044
PH
3159 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3160 urllib.urlencode({'uri': mediaId}))
3161 configReq = urllib2.Request(configUrl)
3162 self.report_config_download(epTitle)
3163 try:
3164 configXml = urllib2.urlopen(configReq).read()
3165 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3166 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3167 return
46c8c432 3168
c8e30044
PH
3169 cdoc = xml.etree.ElementTree.fromstring(configXml)
3170 turls = []
3171 for rendition in cdoc.findall('.//rendition'):
3172 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3173 turls.append(finfo)
3174
a88bc6bb 3175 if len(turls) == 0:
b487ef08 3176 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
a88bc6bb
PH
3177 continue
3178
c8e30044
PH
3179 # For now, just pick the highest bitrate
3180 format,video_url = turls[-1]
3181
3182 self._downloader.increment_downloads()
a88bc6bb 3183
b487ef08 3184 effTitle = showId + '-' + epTitle
c8e30044 3185 info = {
b487ef08 3186 'id': shortMediaId,
c8e30044 3187 'url': video_url,
b487ef08
PH
3188 'uploader': showId,
3189 'upload_date': officialDate,
a88bc6bb
PH
3190 'title': effTitle,
3191 'stitle': self._simplify_title(effTitle),
c8e30044
PH
3192 'ext': 'mp4',
3193 'format': format,
3194 'thumbnail': None,
b487ef08
PH
3195 'description': officialTitle,
3196 'player_url': playerUrl
c8e30044 3197 }
46c8c432 3198
c8e30044
PH
3199 try:
3200 self._downloader.process_info(info)
3201 except UnavailableVideoError, err:
b487ef08 3202 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
a88bc6bb 3203 continue
c8e30044
PH
3204
3205
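# A self-contained sketch of the rendition handling in ComedyCentralIE above,
# run against a made-up media configuration document (it is not called from
# anywhere in this script). Note that taking turls[-1] as the "highest
# bitrate" relies on the feed listing renditions in ascending bitrate order,
# which the sample below assumes.
def _example_pick_rendition():
	sample_config = ('<package>'
		'<rendition bitrate="450"><src>rtmp://example.invalid/low.mp4</src></rendition>'
		'<rendition bitrate="1200"><src>rtmp://example.invalid/high.mp4</src></rendition>'
		'</package>')
	cdoc = xml.etree.ElementTree.fromstring(sample_config)
	turls = []
	for rendition in cdoc.findall('.//rendition'):
		turls.append((rendition.attrib['bitrate'], rendition.findall('./src')[0].text))
	format, video_url = turls[-1]  # ('1200', 'rtmp://example.invalid/high.mp4')
	return format, video_url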
f9c68787
PH
3206class EscapistIE(InfoExtractor):
3207 """Information extractor for The Escapist """
3208
3209 _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
3210
3211 @staticmethod
3212 def suitable(url):
3213 return (re.match(EscapistIE._VALID_URL, url) is not None)
3214
3215 def report_extraction(self, showName):
3216 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3217
3218 def report_config_download(self, showName):
3219 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3220
3221 def _simplify_title(self, title):
3222 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3223 res = res.strip(ur'_')
3224 return res
3225
3226 def _real_extract(self, url):
3227 htmlParser = HTMLParser.HTMLParser()
3228
3229 mobj = re.match(self._VALID_URL, url)
3230 if mobj is None:
3231 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3232 return
3233 showName = mobj.group('showname')
3234 videoId = mobj.group('episode')
3235
3236 self.report_extraction(showName)
3237 try:
3238 webPage = urllib2.urlopen(url).read()
3239 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3240 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3241 return
3242
3243 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3244 description = htmlParser.unescape(descMatch.group(1))
3245 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3246 imgUrl = htmlParser.unescape(imgMatch.group(1))
3247 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3248 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3249 configUrlMatch = re.search('config=(.*)$', playerUrl)
3250 configUrl = urllib2.unquote(configUrlMatch.group(1))
3251
3252 self.report_config_download(showName)
3253 try:
3254 configJSON = urllib2.urlopen(configUrl).read()
3255 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3256 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3257 return
3258
3259 # Technically, it's JavaScript, not JSON
3260 configJSON = configJSON.replace("'", '"')
3261
3262 try:
3263 config = json.loads(configJSON)
3264 except (ValueError,), err:
3265 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3266 return
3267
3268 playlist = config['playlist']
3269 videoUrl = playlist[1]['url']
3270
3271 self._downloader.increment_downloads()
3272 info = {
3273 'id': videoId,
3274 'url': videoUrl,
3275 'uploader': showName,
3276 'upload_date': None,
3277 'title': showName,
3278 'stitle': self._simplify_title(showName),
3279 'ext': 'flv',
3280 'format': 'flv',
3281 'thumbnail': imgUrl,
3282 'description': description,
3283 'player_url': playerUrl,
3284 }
3285
3286 try:
3287 self._downloader.process_info(info)
3288 except UnavailableVideoError, err:
3289 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3290
3291
3292
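# The Escapist "configuration" fetched above is a JavaScript object literal
# rather than strict JSON, which is why the extractor rewrites single quotes
# to double quotes before calling json.loads. A sketch with an invented
# config string:
#
#   configJSON = "{'playlist': [{'url': 'intro.flv'}, {'url': 'episode.flv'}]}"
#   config = json.loads(configJSON.replace("'", '"'))
#   config['playlist'][1]['url']   # -> u'episode.flv'
#
# This only holds as long as the values themselves contain no single quotes,
# a limitation shared by the code above.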
65cd34c5
RG
3293class PostProcessor(object):
3294 """Post Processor class.
3295
3296 PostProcessor objects can be added to downloaders with their
3297 add_post_processor() method. When the downloader has finished a
3298 successful download, it will take its internal chain of PostProcessors
3299 and start calling the run() method on each one of them, first with
3300 an initial argument and then with the returned value of the previous
3301 PostProcessor.
3302
3303 The chain will be stopped if one of them ever returns None or the end
3304 of the chain is reached.
3305
3306 PostProcessor objects follow a "mutual registration" process similar
3307 to InfoExtractor objects.
3308 """
3309
3310 _downloader = None
3311
3312 def __init__(self, downloader=None):
3313 self._downloader = downloader
3314
65cd34c5
RG
3315 def set_downloader(self, downloader):
3316 """Sets the downloader for this PP."""
3317 self._downloader = downloader
d3975459 3318
65cd34c5
RG
3319 def run(self, information):
3320 """Run the PostProcessor.
3321
3322 The "information" argument is a dictionary like the ones
2f11508a 3323 composed by InfoExtractors. The only difference is that this
65cd34c5
RG
3324 one has an extra field called "filepath" that points to the
3325 downloaded file.
3326
3327 When this method returns None, the postprocessing chain is
3328 stopped. However, this method may return an information
3329 dictionary that will be passed to the next postprocessing
3330 object in the chain. It can be the one it received after
3331 changing some fields.
3332
3333 In addition, this method may raise a PostProcessingError
3334 exception that will be taken into account by the downloader
3335 it was called from.
3336 """
3337 return information # by default, do nothing
d3975459 3338
c0a10ca8 3339
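# A minimal sketch of the PostProcessor contract documented above. The class
# below is illustrative only and is not registered anywhere in this script;
# it just reports the final file path and hands the info dict on unchanged.
class _ExampleLoggingPP(PostProcessor):
	"""Example post processor: report the downloaded file path, keep the chain going."""
	def run(self, information):
		if self._downloader is not None:
			self._downloader.to_screen(u'[examplepp] downloaded to %s' % information['filepath'])
		# Returning the dictionary lets later post processors run; returning
		# None here would stop the chain instead.
		return information
# It would be attached the same way as FFmpegExtractAudioPP below, i.e.
# fd.add_post_processor(_ExampleLoggingPP()).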
3072fab1
RG
3340class FFmpegExtractAudioPP(PostProcessor):
3341
3342 def __init__(self, downloader=None, preferredcodec=None):
3343 PostProcessor.__init__(self, downloader)
3344 if preferredcodec is None:
3345 preferredcodec = 'best'
3346 self._preferredcodec = preferredcodec
3347
3348 @staticmethod
3349 def get_audio_codec(path):
da273188 3350 try:
2727dbf7
RG
3351 cmd = ['ffprobe', '-show_streams', '--', path]
3352 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
da273188
RG
3353 output = handle.communicate()[0]
3354 if handle.wait() != 0:
3355 return None
3356 except (IOError, OSError):
3072fab1
RG
3357 return None
3358 audio_codec = None
3359 for line in output.split('\n'):
3360 if line.startswith('codec_name='):
3361 audio_codec = line.split('=')[1].strip()
3362 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
3363 return audio_codec
3364 return None
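	# ffprobe -show_streams prints one key=value line per field, grouped per
	# stream, e.g. (trimmed):
	#   [STREAM]
	#   codec_name=aac
	#   codec_type=audio
	#   [/STREAM]
	# The loop above remembers the most recent codec_name and returns it when
	# it reaches the matching codec_type=audio line, so for output like this
	# it would return 'aac'.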
3365
3366 @staticmethod
3367 def run_ffmpeg(path, out_path, codec, more_opts):
3368 try:
2727dbf7
RG
3369 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3370 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3072fab1
RG
3371 return (ret == 0)
3372 except (IOError, OSError):
3373 return False
3374
3375 def run(self, information):
3376 path = information['filepath']
3377
3378 filecodec = self.get_audio_codec(path)
3379 if filecodec is None:
da273188 3380 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
3072fab1
RG
3381 return None
3382
3383 more_opts = []
3384 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3385 if filecodec == 'aac' or filecodec == 'mp3':
3386 # Lossless if possible
3387 acodec = 'copy'
3388 extension = filecodec
3389 if filecodec == 'aac':
3390 more_opts = ['-f', 'adts']
3391 else:
3392 # MP3 otherwise.
3393 acodec = 'libmp3lame'
3394 extension = 'mp3'
3395 more_opts = ['-ab', '128k']
3396 else:
3397 # We convert the audio (lossy)
3398 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
3399 extension = self._preferredcodec
3400 more_opts = ['-ab', '128k']
3401 if self._preferredcodec == 'aac':
3402 more_opts += ['-f', 'adts']
3403
3404 (prefix, ext) = os.path.splitext(path)
3405 new_path = prefix + '.' + extension
3406 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3407 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3408
3409 if not status:
1bd92582 3410 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
3072fab1
RG
3411 return None
3412
3413 try:
3414 os.remove(path)
3415 except (IOError, OSError):
3416 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
3417 return None
3418
3419 information['filepath'] = new_path
3420 return information
3421
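# Putting run() and run_ffmpeg() together: for a downloaded video.flv whose
# audio track ffprobe reports as 'vorbis', with --audio-format mp3 the command
# assembled above is roughly
#   ffmpeg -y -i video.flv -vn -acodec libmp3lame -ab 128k -- video.mp3
# and the original video.flv is removed once the conversion succeeds
# (file names here are only an illustration).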
5fb3df4a
GV
3422
3423def updateSelf(downloader, filename):
3424 ''' Update the program file with the latest version from the repository '''
3425 # Note: downloader only used for options
3426 if not os.access(filename, os.W_OK):
3427 sys.exit('ERROR: no write permissions on %s' % filename)
3428
d207e7cf 3429 downloader.to_screen('Updating to latest version...')
5fb3df4a 3430
4fa74b52 3431 try:
d207e7cf
PH
3432 try:
3433 urlh = urllib.urlopen(UPDATE_URL)
3434 newcontent = urlh.read()
3435 finally:
3436 urlh.close()
5fb3df4a
GV
3437 except (IOError, OSError), err:
3438 sys.exit('ERROR: unable to download latest version')
f9f1e798 3439
5fb3df4a 3440 try:
d207e7cf
PH
3441 outf = open(filename, 'wb')
3442 try:
3443 outf.write(newcontent)
3444 finally:
3445 outf.close()
5fb3df4a
GV
3446 except (IOError, OSError), err:
3447 sys.exit('ERROR: unable to overwrite current version')
4bec29ef 3448
d207e7cf 3449 downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
80066952 3450
4f9f96f6
GV
3451def parseOpts():
3452 # Deferred imports
3453 import getpass
3454 import optparse
e7cf18cb 3455
4f9f96f6
GV
3456 def _format_option_string(option):
3457 ''' ('-o', '--option') -> -o, --option METAVAR'''
80066952 3458
4f9f96f6
GV
3459 opts = []
3460
3461 if option._short_opts: opts.append(option._short_opts[0])
3462 if option._long_opts: opts.append(option._long_opts[0])
3463 if len(opts) > 1: opts.insert(1, ', ')
3464
3465 if option.takes_value(): opts.append(' %s' % option.metavar)
3466
3467 return "".join(opts)
3468
6a4f0a11
GV
3469 def _find_term_columns():
3470 columns = os.environ.get('COLUMNS', None)
2c8d32de
PH
3471 if columns:
3472 return int(columns)
3473
4f2a5e06
PH
3474 try:
3475 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3476 out,err = sp.communicate()
eb0387a8 3477 return int(out.split()[1])
4f2a5e06
PH
3478 except:
3479 pass
2c8d32de 3480 return None
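		# 'stty size' prints "<rows> <columns>" (e.g. "24 80"), so
		# int(out.split()[1]) yields the terminal width used to widen the help
		# output below.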
6a4f0a11 3481
51c8e53f
GV
3482 max_width = 80
3483 max_help_position = 80
3484
3485 # No need to wrap help messages if we're on a wide console
6a4f0a11 3486 columns = _find_term_columns()
51c8e53f
GV
3487 if columns: max_width = columns
3488
3489 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4f9f96f6
GV
3490 fmt.format_option_strings = _format_option_string
3491
3492 kw = {
3493 'version' : __version__,
3494 'formatter' : fmt,
a2f7e3a5 3495 'usage' : '%prog [options] url [url...]',
4f9f96f6
GV
3496 'conflict_handler' : 'resolve',
3497 }
3498
3499 parser = optparse.OptionParser(**kw)
3500
3501 # option groups
3502 general = optparse.OptionGroup(parser, 'General Options')
20e91e83 3503 selection = optparse.OptionGroup(parser, 'Video Selection')
4f9f96f6
GV
3504 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3505 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3506 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3507 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3508 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3509
3510 general.add_option('-h', '--help',
3511 action='help', help='print this help text and exit')
3512 general.add_option('-v', '--version',
3513 action='version', help='print program version and exit')
3514 general.add_option('-U', '--update',
e0e56865 3515 action='store_true', dest='update_self', help='update this program to latest version')
4f9f96f6
GV
3516 general.add_option('-i', '--ignore-errors',
3517 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3518 general.add_option('-r', '--rate-limit',
3519 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3520 general.add_option('-R', '--retries',
3521 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4f9f96f6
GV
3522 general.add_option('--dump-user-agent',
3523 action='store_true', dest='dump_user_agent',
3524 help='display the current browser identification', default=False)
3525
20e91e83
ABP
3526 selection.add_option('--playlist-start',
3527 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3528 selection.add_option('--playlist-end',
3529 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3530 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3531 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
3532
4f9f96f6
GV
3533 authentication.add_option('-u', '--username',
3534 dest='username', metavar='USERNAME', help='account username')
3535 authentication.add_option('-p', '--password',
3536 dest='password', metavar='PASSWORD', help='account password')
3537 authentication.add_option('-n', '--netrc',
3538 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3539
3540
3541 video_format.add_option('-f', '--format',
3542 action='store', dest='format', metavar='FORMAT', help='video format code')
3543 video_format.add_option('--all-formats',
3544 action='store_const', dest='format', help='download all available video formats', const='-1')
3545 video_format.add_option('--max-quality',
3546 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3547
3548
3549 verbosity.add_option('-q', '--quiet',
3550 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3551 verbosity.add_option('-s', '--simulate',
3552 action='store_true', dest='simulate', help='do not download video', default=False)
3553 verbosity.add_option('-g', '--get-url',
3554 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3555 verbosity.add_option('-e', '--get-title',
3556 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3557 verbosity.add_option('--get-thumbnail',
3558 action='store_true', dest='getthumbnail',
3559 help='simulate, quiet but print thumbnail URL', default=False)
3560 verbosity.add_option('--get-description',
3561 action='store_true', dest='getdescription',
3562 help='simulate, quiet but print video description', default=False)
3563 verbosity.add_option('--get-filename',
3564 action='store_true', dest='getfilename',
3565 help='simulate, quiet but print output filename', default=False)
3566 verbosity.add_option('--no-progress',
3567 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3568 verbosity.add_option('--console-title',
3569 action='store_true', dest='consoletitle',
3570 help='display progress in console titlebar', default=False)
3571
3572
3573 filesystem.add_option('-t', '--title',
3574 action='store_true', dest='usetitle', help='use title in file name', default=False)
3575 filesystem.add_option('-l', '--literal',
3576 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3577 filesystem.add_option('-A', '--auto-number',
3578 action='store_true', dest='autonumber',
3579 help='number downloaded files starting from 00000', default=False)
3580 filesystem.add_option('-o', '--output',
3581 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3582 filesystem.add_option('-a', '--batch-file',
3583 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3584 filesystem.add_option('-w', '--no-overwrites',
3585 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3586 filesystem.add_option('-c', '--continue',
3587 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3588 filesystem.add_option('--cookies',
3589 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3590 filesystem.add_option('--no-part',
3591 action='store_true', dest='nopart', help='do not use .part files', default=False)
3592 filesystem.add_option('--no-mtime',
3593 action='store_false', dest='updatetime',
3594 help='do not use the Last-modified header to set the file modification time', default=True)
2c8d32de
PH
3595 filesystem.add_option('--write-description',
3596 action='store_true', dest='writedescription',
3597 help='write video description to a .description file', default=False)
3598 filesystem.add_option('--write-info-json',
3599 action='store_true', dest='writeinfojson',
3600 help='write video metadata to a .info.json file', default=False)
4f9f96f6
GV
3601
3602
3603 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3604 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3605 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3606 help='"best", "aac" or "mp3"; best by default')
3607
3608
3609 parser.add_option_group(general)
20e91e83 3610 parser.add_option_group(selection)
4f9f96f6
GV
3611 parser.add_option_group(filesystem)
3612 parser.add_option_group(verbosity)
3613 parser.add_option_group(video_format)
3614 parser.add_option_group(authentication)
3615 parser.add_option_group(postproc)
3616
3617 opts, args = parser.parse_args()
3618
3619 return parser, opts, args
3620
5adcaa43
GV
3621def main():
3622 parser, opts, args = parseOpts()
4f9f96f6 3623
5adcaa43
GV
3624 # Open appropriate CookieJar
3625 if opts.cookiefile is None:
3626 jar = cookielib.CookieJar()
3627 else:
8cc44341 3628 try:
5adcaa43
GV
3629 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3630 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3631 jar.load()
3632 except (IOError, OSError), err:
3633 sys.exit(u'ERROR: unable to open cookie file')
80066952 3634
5adcaa43
GV
3635 # Dump user agent
3636 if opts.dump_user_agent:
3637 print std_headers['User-Agent']
3638 sys.exit(0)
e7cf18cb 3639
5adcaa43
GV
3640 # General configuration
3641 cookie_processor = urllib2.HTTPCookieProcessor(jar)
c8e30044
PH
3642 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3643 urllib2.install_opener(opener)
5adcaa43 3644 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
80066952 3645
5adcaa43
GV
3646 # Batch file verification
3647 batchurls = []
3648 if opts.batchfile is not None:
8cc44341 3649 try:
5adcaa43
GV
3650 if opts.batchfile == '-':
3651 batchfd = sys.stdin
4bec29ef 3652 else:
5adcaa43
GV
3653 batchfd = open(opts.batchfile, 'r')
3654 batchurls = batchfd.readlines()
3655 batchurls = [x.strip() for x in batchurls]
3656 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3657 except IOError:
3658 sys.exit(u'ERROR: batch file could not be read')
3659 all_urls = batchurls + args
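	# A batch file is simply one URL per line; empty lines and lines starting
	# with '#', '/' or ';' are skipped as comments, e.g.:
	#   # my download queue
	#   http://www.youtube.com/watch?v=abcdefghijk
	#   ; disabled for now
	#   http://blip.tv/file/123456
	# (the URLs above are placeholders only).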
3660
3661 # Conflicting, missing and erroneous options
3662 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3663 parser.error(u'using .netrc conflicts with giving username/password')
3664 if opts.password is not None and opts.username is None:
3665 parser.error(u'account username missing')
3666 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3667 parser.error(u'using output template conflicts with using title, literal title or auto number')
3668 if opts.usetitle and opts.useliteral:
3669 parser.error(u'using title conflicts with using literal title')
3670 if opts.username is not None and opts.password is None:
3671 opts.password = getpass.getpass(u'Type account password and press return:')
3672 if opts.ratelimit is not None:
3673 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3674 if numeric_limit is None:
3675 parser.error(u'invalid rate limit specified')
3676 opts.ratelimit = numeric_limit
3677 if opts.retries is not None:
8cc44341 3678 try:
5adcaa43 3679 opts.retries = long(opts.retries)
8cc44341 3680 except (TypeError, ValueError), err:
5adcaa43
GV
3681 parser.error(u'invalid retry count specified')
3682 try:
2c8d32de 3683 opts.playliststart = int(opts.playliststart)
5adcaa43 3684 if opts.playliststart <= 0:
2c8d32de 3685 raise ValueError(u'Playlist start must be positive')
5adcaa43
GV
3686 except (TypeError, ValueError), err:
3687 parser.error(u'invalid playlist start number specified')
3688 try:
2c8d32de 3689 opts.playlistend = int(opts.playlistend)
5adcaa43 3690 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2c8d32de 3691 raise ValueError(u'Playlist end must be greater than playlist start')
5adcaa43
GV
3692 except (TypeError, ValueError), err:
3693 parser.error(u'invalid playlist end number specified')
3694 if opts.extractaudio:
3695 if opts.audioformat not in ['best', 'aac', 'mp3']:
3696 parser.error(u'invalid audio format specified')
3697
3698 # Information extractors
3699 youtube_ie = YoutubeIE()
5adcaa43 3700 google_ie = GoogleIE()
5adcaa43 3701 yahoo_ie = YahooIE()
8c5dc3ad
PH
3702 extractors = [ # Order does matter
3703 youtube_ie,
3704 MetacafeIE(youtube_ie),
3705 DailymotionIE(),
3706 YoutubePlaylistIE(youtube_ie),
3707 YoutubeUserIE(youtube_ie),
3708 YoutubeSearchIE(youtube_ie),
3709 google_ie,
3710 GoogleSearchIE(google_ie),
3711 PhotobucketIE(),
3712 yahoo_ie,
3713 YahooSearchIE(yahoo_ie),
3714 DepositFilesIE(),
3715 FacebookIE(),
3716 BlipTVIE(),
3717 VimeoIE(),
3718 MyVideoIE(),
3719 ComedyCentralIE(),
f9c68787 3720 EscapistIE(),
8c5dc3ad
PH
3721
3722 GenericIE()
3723 ]
5adcaa43
GV
3724
3725 # File downloader
3726 fd = FileDownloader({
3727 'usenetrc': opts.usenetrc,
3728 'username': opts.username,
3729 'password': opts.password,
3730 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3731 'forceurl': opts.geturl,
3732 'forcetitle': opts.gettitle,
3733 'forcethumbnail': opts.getthumbnail,
3734 'forcedescription': opts.getdescription,
3735 'forcefilename': opts.getfilename,
3736 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3737 'format': opts.format,
3738 'format_limit': opts.format_limit,
3739 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3740 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3741 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3742 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3743 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3744 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3745 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3746 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3747 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3748 or u'%(id)s.%(ext)s'),
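		# The cascade above picks the first template whose conditions hold, so
		# for example plain -t gives u'%(stitle)s-%(id)s.%(ext)s' and no
		# relevant options at all fall through to u'%(id)s.%(ext)s'.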
3749 'ignoreerrors': opts.ignoreerrors,
3750 'ratelimit': opts.ratelimit,
3751 'nooverwrites': opts.nooverwrites,
3752 'retries': opts.retries,
3753 'continuedl': opts.continue_dl,
3754 'noprogress': opts.noprogress,
3755 'playliststart': opts.playliststart,
3756 'playlistend': opts.playlistend,
3757 'logtostderr': opts.outtmpl == '-',
3758 'consoletitle': opts.consoletitle,
3759 'nopart': opts.nopart,
3760 'updatetime': opts.updatetime,
2c8d32de
PH
3761 'writedescription': opts.writedescription,
3762 'writeinfojson': opts.writeinfojson,
20e91e83
ABP
3763 'matchtitle': opts.matchtitle,
3764 'rejecttitle': opts.rejecttitle,
5adcaa43 3765 })
8c5dc3ad
PH
3766 for extractor in extractors:
3767 fd.add_info_extractor(extractor)
5adcaa43
GV
3768
3769 # PostProcessors
3770 if opts.extractaudio:
3771 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3772
3773 # Update version
3774 if opts.update_self:
3775 updateSelf(fd, sys.argv[0])
3776
3777 # Maybe do nothing
3778 if len(all_urls) < 1:
3779 if not opts.update_self:
3780 parser.error(u'you must provide at least one URL')
3781 else:
3782 sys.exit()
3783 retcode = fd.download(all_urls)
80066952 3784
5adcaa43
GV
3785 # Dump cookie jar if requested
3786 if opts.cookiefile is not None:
3787 try:
3788 jar.save()
3789 except (IOError, OSError), err:
3790 sys.exit(u'ERROR: unable to save cookie jar')
80066952 3791
5adcaa43 3792 sys.exit(retcode)
80066952 3793
4fa74b52 3794
5adcaa43
GV
3795if __name__ == '__main__':
3796 try:
3797 main()
e5bf0f55
RG
3798 except DownloadError:
3799 sys.exit(1)
3800 except SameFileError:
76a7f364 3801 sys.exit(u'ERROR: fixed output name but more than one file to download')
4fa74b52 3802 except KeyboardInterrupt:
76a7f364 3803 sys.exit(u'\nERROR: Interrupted by user')
e9cb9c28
GV
3804
3805# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: