#!/usr/bin/env python
# -*- coding: utf-8 -*-

__author__ = (
    'Ricardo Garcia Gonzalez',
    'Danny Colligan',
    'Benjamin Johnson',
    'Vasyl\' Vavrychuk',
    'Witold Baryluk',
    'Paweł Paprota',
    'Gergely Imreh',
    'Rogério Brito',
    'Philipp Hagemeister',
    'Sören Schulze',
    )

__license__ = 'Public Domain'
__version__ = '2011.09.30'

UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'

import cookielib
import datetime
import gzip
import htmlentitydefs
import HTMLParser
import httplib
import locale
import math
import netrc
import os
import os.path
import re
import socket
import string
import subprocess
import sys
import time
import urllib
import urllib2
import warnings
import zlib

if os.name == 'nt':
    import ctypes

try:
    import email.utils
except ImportError: # Python 2.4
    import email.Utils
try:
    import cStringIO as StringIO
except ImportError:
    import StringIO

# parse_qs was moved from the cgi module to the urlparse module recently.
try:
    from urlparse import parse_qs
except ImportError:
    from cgi import parse_qs

try:
    import lxml.etree
except ImportError:
    pass # Handled below

try:
    import xml.etree.ElementTree
except ImportError: # Python<2.5: Not officially supported, but let it slip
    warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}

simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')

try:
    import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    import re
    class json(object):
        @staticmethod
        def loads(s):
            s = s.decode('UTF-8')
            def raiseError(msg, i):
                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
            def skipSpace(i, expectMore=True):
                while i < len(s) and s[i] in ' \t\r\n':
                    i += 1
                if expectMore:
                    if i >= len(s):
                        raiseError('Premature end', i)
                return i
            def decodeEscape(match):
                esc = match.group(1)
                _STATIC = {
                    '"': '"',
                    '\\': '\\',
                    '/': '/',
                    'b': unichr(0x8),
                    'f': unichr(0xc),
                    'n': '\n',
                    'r': '\r',
                    't': '\t',
                }
                if esc in _STATIC:
                    return _STATIC[esc]
                if esc[0] == 'u':
                    if len(esc) == 1+4:
                        return unichr(int(esc[1:5], 16))
                    if len(esc) == 5+6 and esc[5:7] == '\\u':
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                raise ValueError('Unknown escape ' + str(esc))
            def parseString(i):
                i += 1
                e = i
                while True:
                    e = s.index('"', e)
                    bslashes = 0
                    while s[e-bslashes-1] == '\\':
                        bslashes += 1
                    if bslashes % 2 == 1:
                        e += 1
                        continue
                    break
                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                stri = rexp.sub(decodeEscape, s[i:e])
                return (e+1,stri)
            def parseObj(i):
                i += 1
                res = {}
                i = skipSpace(i)
                if s[i] == '}': # Empty dictionary
                    return (i+1,res)
                while True:
                    if s[i] != '"':
                        raiseError('Expected a string object key', i)
                    i,key = parseString(i)
                    i = skipSpace(i)
                    if i >= len(s) or s[i] != ':':
                        raiseError('Expected a colon', i)
                    i,val = parse(i+1)
                    res[key] = val
                    i = skipSpace(i)
                    if s[i] == '}':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected comma or closing curly brace', i)
                    i = skipSpace(i+1)
            def parseArray(i):
                res = []
                i = skipSpace(i+1)
                if s[i] == ']': # Empty array
                    return (i+1,res)
                while True:
                    i,val = parse(i)
                    res.append(val)
                    i = skipSpace(i) # Raise exception if premature end
                    if s[i] == ']':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected a comma or closing bracket', i)
                    i = skipSpace(i+1)
            def parseDiscrete(i):
                for k,v in {'true': True, 'false': False, 'null': None}.items():
                    if s.startswith(k, i):
                        return (i+len(k), v)
                raiseError('Not a boolean (or null)', i)
            def parseNumber(i):
                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                if mobj is None:
                    raiseError('Not a number', i)
                nums = mobj.group(1)
                if '.' in nums or 'e' in nums or 'E' in nums:
                    return (i+len(nums), float(nums))
                return (i+len(nums), int(nums))
            CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
            def parse(i):
                i = skipSpace(i)
                i,res = CHARMAP.get(s[i], parseNumber)(i)
                i = skipSpace(i, False)
                return (i,res)
            i,res = parse(0)
            if i < len(s):
                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
            return res

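# Illustrative example: whichever branch above wins, the extractors only rely
# on json.loads(), e.g. json.loads('{"token": "abc", "args": [1, 2.5, null]}')
# returns {u'token': u'abc', u'args': [1, 2.5, None]} under the standard
# library and under the trivialjson fallback alike. The fallback does not
# provide json.dump(), which is probed separately before --write-info-json
# uses it.
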
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    def yield_preferredencoding():
        try:
            pref = locale.getpreferredencoding()
            u'TEST'.encode(pref)
        except:
            pref = 'UTF-8'
        while True:
            yield pref
    return yield_preferredencoding().next()


def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Unicode character (decimal or hexadecimal numeric entity)
    mobj = re.match(ur'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)


def sanitize_title(utitle):
    """Sanitizes a video title so it can be used as part of a filename."""
    utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
    return utitle.replace(unicode(os.sep), u'%')


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(filename, open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)


def timeconvert(timestr):
    """Convert an RFC 2822 date/time string into a system timestamp."""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected


class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

      http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

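# Illustrative usage sketch for YoutubeDLHandler (the real opener is wired up
# elsewhere in this script; the URL below is only an example): install the
# handler into an OpenerDirector and mark requests that must not be served
# compressed with the magic header, which urllib2 stores capitalized as
# 'Youtubedl-no-compression':
#
#   opener = urllib2.build_opener(YoutubeDLHandler())
#   request = urllib2.Request('http://www.youtube.com/', None, {'Youtubedl-no-compression': 'True'})
#   page = opener.open(request).read()  # std_headers are added, the magic header is stripped
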
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. Given a video URL, the downloader doesn't know how to
    extract all the needed information (that is the InfoExtractors' job),
    so it has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge of the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:         Username for authentication purposes.
    password:         Password for authentication purposes.
    usenetrc:         Use netrc for authentication instead.
    quiet:            Do not print messages to stdout.
    forceurl:         Force printing final URL.
    forcetitle:       Force printing title.
    forcethumbnail:   Force printing thumbnail URL.
    forcedescription: Force printing description.
    forcefilename:    Force printing final filename.
    forceformat:      Force printing format.
    simulate:         Do not download the video files.
    skip_download:    Skip the actual download of the video file.
    format:           Video format code.
    format_limit:     Highest quality format to try.
    listformats:      Print an overview of available video formats instead of downloading.
    outtmpl:          Template for output names.
    ignoreerrors:     Do not stop on download errors.
    ratelimit:        Download speed limit, in bytes/sec.
    nooverwrites:     Prevent overwriting files.
    retries:          Number of times to retry for HTTP error 5xx.
    continuedl:       Try to continue downloads if possible.
    noprogress:       Do not print the progress bar.
    playliststart:    Playlist item to start at.
    playlistend:      Playlist item to end at.
    matchtitle:       Download only matching titles.
    rejecttitle:      Reject downloads for matching titles.
    logtostderr:      Log messages to stderr instead of stdout.
    consoletitle:     Display progress in console window's titlebar.
    nopart:           Do not use temporary .part files.
    updatetime:       Use the Last-modified header to set output file timestamps.
    writedescription: Write the video description to a .description file.
    writeinfojson:    Write the video metadata to a .info.json file.
    """

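    # Illustrative sketch of the "mutual registration" described above; the
    # actual wiring happens in the command-line front end, and the URL and
    # options here are only examples:
    #
    #   fd = FileDownloader({'outtmpl': u'%(stitle)s-%(id)s.%(ext)s'})
    #   fd.add_info_extractor(YoutubeIE())   # the IE's _downloader is set to fd
    #   retcode = fd.download(['http://www.youtube.com/watch?v=SOME_VIDEO_ID'])
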
    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None
    _screen_file = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._ies = []
        self._pps = []
        self._download_retcode = 0
        self._num_downloads = 0
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self.params = params

    @staticmethod
    def format_bytes(bytes):
        if bytes is None:
            return 'N/A'
        if type(bytes) is str:
            bytes = float(bytes)
        if bytes == 0.0:
            exponent = 0
        else:
            exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffix)

    @staticmethod
    def calc_percent(byte_counter, data_len):
        if data_len is None:
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001: # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        dif = now - start
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return long(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return long(new_max)
        if rate < new_min:
            return long(new_min)
        return long(rate)

    @staticmethod
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        if matchobj is None:
            return None
        number = float(matchobj.group(1))
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

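    # Worked examples for parse_bytes() above (illustrative): '500k' parses to
    # 500 * 1024 = 512000L, '10.5m' to 10.5 * 1024**2 = 11010048L, and a bare
    # '2048' (empty suffix) keeps a multiplier of 1.
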
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        pp.set_downloader(self)

    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        try:
            if not self.params.get('quiet', False):
                terminator = [u'\n', u''][skip_eol]
                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                self._screen_file.flush()
        except (UnicodeEncodeError), err:
            if not ignore_encoding_errors:
                raise

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message.encode(preferredencoding())

    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            return
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))

    def fixed_template(self):
        """Checks if the output template is fixed."""
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            return
        now = time.time()
        elapsed = now - start_time
        if elapsed <= 0.0:
            return
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def temp_name(self, filename):
        """Returns a temporary filename for the given filename."""
        if self.params.get('nopart', False) or filename == u'-' or \
                (os.path.exists(filename) and not os.path.isfile(filename)):
            return filename
        return filename + u'.part'

    def undo_temp_name(self, filename):
        if filename.endswith(u'.part'):
            return filename[:-len(u'.part')]
        return filename

    def try_rename(self, old_filename, new_filename):
        try:
            if old_filename == new_filename:
                return
            os.rename(old_filename, new_filename)
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')

    def try_utime(self, filename, last_modified_hdr):
        """Try to set the last-modified time of the given file."""
        if last_modified_hdr is None:
            return
        if not os.path.isfile(filename):
            return
        timestr = last_modified_hdr
        if timestr is None:
            return
        filetime = timeconvert(timestr)
        if filetime is None:
            return filetime
        try:
            os.utime(filename, (time.time(), filetime))
        except:
            pass
        return filetime

    def report_writedescription(self, descfn):
        """Report that the description file is being written."""
        self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)

    def report_writeinfojson(self, infofn):
        """Report that the metadata file is being written."""
        self.to_screen(u'[info] Writing video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            return
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx."""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            self.to_screen(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        else:
            self.to_screen(u'')

    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1

    def prepare_filename(self, info_dict):
        """Generate the output filename."""
        try:
            template_dict = dict(info_dict)
            template_dict['epoch'] = unicode(long(time.time()))
            template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
            filename = self.params['outtmpl'] % template_dict
            return filename
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            return None

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        filename = self.prepare_filename(info_dict)

        # Forced printings
        if self.params.get('forcetitle', False):
            print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forceurl', False):
            print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
            print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcedescription', False) and 'description' in info_dict:
            print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcefilename', False) and filename is not None:
            print filename.encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forceformat', False):
            print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            return

        if filename is None:
            return

        matchtitle = self.params.get('matchtitle', False)
        rejecttitle = self.params.get('rejecttitle', False)
        title = info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
        if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
            self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
            return
        if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
            self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
            return

        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')
            return

        try:
            dn = os.path.dirname(filename)
            if dn != '' and not os.path.exists(dn):
                os.makedirs(dn)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directory ' + unicode(err))
            return

        if self.params.get('writedescription', False):
            try:
                descfn = filename + '.description'
                self.report_writedescription(descfn)
                descfile = open(descfn, 'wb')
                try:
                    descfile.write(info_dict['description'].encode('utf-8'))
                finally:
                    descfile.close()
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write description file ' + descfn)
                return

        if self.params.get('writeinfojson', False):
            infofn = filename + '.info.json'
            self.report_writeinfojson(infofn)
            try:
                json.dump
            except (NameError, AttributeError):
                self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
                return
            try:
                infof = open(infofn, 'wb')
                try:
                    json_info_dict = dict((k, v) for k, v in info_dict.iteritems() if not k in ('urlhandle',))
                    json.dump(json_info_dict, infof)
                finally:
                    infof.close()
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
                return

        if not self.params.get('skip_download', False):
            try:
                success = self._do_download(filename, info_dict)
            except (OSError, IOError), err:
                raise UnavailableVideoError
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                return
            except (ContentTooShortError, ), err:
                self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                return

            if success:
                try:
                    self.post_process(filename, info_dict)
                except (PostProcessingError), err:
                    self.trouble(u'ERROR: postprocessing: %s' % str(err))
                    return

    def download(self, url_list):
        """Download a given list of URLs."""
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):
                    continue

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it
                ie.extract(url)

                # Suitable InfoExtractor had been found; go to next URL
                break

            if not suitable_found:
                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        info = dict(ie_info)
        info['filepath'] = filename
        for pp in self._pps:
            info = pp.run(info)
            if info is None:
                break

    def _download_with_rtmpdump(self, filename, url, player_url):
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrupted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            if prevsize == cursize and retval == 1:
                break
            # Some rtmp streams seem to abort after ~ 99.8%. Don't complain for those
            if prevsize == cursize and retval == 2 and cursize > 1024:
                self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
                retval = 0
                break
        if retval == 0:
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
            self.try_rename(tmpfilename, filename)
            return True
        else:
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            return False

    def _do_download(self, filename, info_dict):
        url = info_dict['url']
        player_url = info_dict.get('player_url', None)

        # Check file already present
        if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
            self.report_file_already_downloaded(filename)
            return True

        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        tmpfilename = self.temp_name(filename)
        stream = None

        # Do not include the Accept-Encoding header
        headers = {'Youtubedl-no-compression': 'True'}
        basic_request = urllib2.Request(url, None, headers)
        request = urllib2.Request(url, None, headers)

        # Establish possible resume length
        if os.path.isfile(tmpfilename):
            resume_len = os.path.getsize(tmpfilename)
        else:
            resume_len = 0

        open_mode = 'wb'
        if resume_len != 0:
            if self.params.get('continuedl', False):
                self.report_resuming_byte(resume_len)
                request.add_header('Range', 'bytes=%d-' % resume_len)
                open_mode = 'ab'
            else:
                resume_len = 0

        count = 0
        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
            try:
                if count == 0 and 'urlhandle' in info_dict:
                    data = info_dict['urlhandle']
                else:
                    data = urllib2.urlopen(request)
                break
            except (urllib2.HTTPError, ), err:
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                    raise
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    try:
                        # Open the connection again without the range header
                        data = urllib2.urlopen(basic_request)
                        content_length = data.info()['Content-Length']
                    except (urllib2.HTTPError, ), err:
                        if err.code < 500 or err.code >= 600:
                            raise
                    else:
                        # Examine the reported length
                        if (content_length is not None and
                                (resume_len - 100 < long(content_length) < resume_len + 100)):
                            # The file had already been fully downloaded.
                            # Explanation to the above condition: in issue #175 it was revealed that
                            # YouTube sometimes adds or removes a few bytes from the end of the file,
                            # changing the file size slightly and causing problems for some users. So
                            # I decided to implement a suggested change and consider the file
                            # completely downloaded if the file size differs less than 100 bytes from
                            # the one in the hard drive.
                            self.report_file_already_downloaded(filename)
                            self.try_rename(tmpfilename, filename)
                            return True
                        else:
                            # The length does not match, we start the download over
                            self.report_unable_to_resume()
                            open_mode = 'wb'
                            break
            # Retry
            count += 1
            if count <= retries:
                self.report_retry(count, retries)

        if count > retries:
            self.trouble(u'ERROR: giving up after %s retries' % retries)
            return False

        data_len = data.info().get('Content-length', None)
        if data_len is not None:
            data_len = long(data_len) + resume_len
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0 + resume_len
        block_size = 1024
        start = time.time()
        while True:
            # Download and write
            before = time.time()
            data_block = data.read(block_size)
            after = time.time()
            if len(data_block) == 0:
                break
            byte_counter += len(data_block)

            # Open file just in time
            if stream is None:
                try:
                    (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
                    assert stream is not None
                    filename = self.undo_temp_name(tmpfilename)
                    self.report_destination(filename)
                except (OSError, IOError), err:
                    self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
                    return False
            try:
                stream.write(data_block)
            except (IOError, OSError), err:
                self.trouble(u'\nERROR: unable to write data: %s' % str(err))
                return False
            block_size = self.best_block_size(after - before, len(data_block))

            # Progress message
            speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
            if data_len is None:
                self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
            else:
                percent_str = self.calc_percent(byte_counter, data_len)
                eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
                self.report_progress(percent_str, data_len_str, speed_str, eta_str)

            # Apply rate limit
            self.slow_down(start, byte_counter - resume_len)

        if stream is None:
            self.trouble(u'\nERROR: Did not get any data blocks')
            return False
        stream.close()
        self.report_finish()
        if data_len is not None and byte_counter != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        self.try_rename(tmpfilename, filename)

        # Update file modification time
        if self.params.get('updatetime', True):
            info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

        return True

class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information, possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

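# Minimal sketch of the subclassing contract described in InfoExtractor's
# docstring; 'ExampleIE' and example.com are hypothetical and only illustrate
# the shape of the dictionary handed to process_info():
#
#   class ExampleIE(InfoExtractor):
#       _VALID_URL = r'(?:http://)?(?:www\.)?example\.com/video/([0-9]+)'
#       IE_NAME = u'example'
#
#       def _real_extract(self, url):
#           video_id = re.match(self._VALID_URL, url).group(1)
#           self._downloader.increment_downloads()
#           self._downloader.process_info({
#               'id': video_id, 'url': u'http://example.com/%s.mp4' % video_id,
#               'uploader': u'NA', 'title': u'Example video', 'stitle': u'Example_video',
#               'ext': u'mp4', 'format': u'NA', 'player_url': None,
#           })
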
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
    }
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
    }
    IE_NAME = u'youtube'

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _print_formats(self, formats):
        print 'Available formats:'
        for x in formats:
            print '%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description
        try:
            lxml.etree
        except NameError:
            video_description = u'No description available.'
            if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
                mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
                if mobj is not None:
                    video_description = mobj.group(1).decode('utf-8')
        else:
            html_parser = lxml.etree.HTMLParser(encoding='utf-8')
            vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
            video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
            # TODO use another parser

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description,
                    'player_url': player_url,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')

020f7150
RG
1404class MetacafeIE(InfoExtractor):
1405 """Information Extractor for metacafe.com."""
1406
1407 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
2546e767 1408 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
dbccb6cd 1409 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
020f7150 1410 _youtube_ie = None
f3098c4d 1411 IE_NAME = u'metacafe'
020f7150
RG
1412
1413 def __init__(self, youtube_ie, downloader=None):
1414 InfoExtractor.__init__(self, downloader)
1415 self._youtube_ie = youtube_ie
1416
020f7150
RG
1417 def report_disclaimer(self):
1418 """Report disclaimer retrieval."""
331ce0a0 1419 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
020f7150
RG
1420
1421 def report_age_confirmation(self):
1422 """Report attempt to confirm age."""
331ce0a0 1423 self._downloader.to_screen(u'[metacafe] Confirming age')
d3975459 1424
020f7150
RG
1425 def report_download_webpage(self, video_id):
1426 """Report webpage download."""
331ce0a0 1427 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
d3975459 1428
020f7150
RG
1429 def report_extraction(self, video_id):
1430 """Report information extraction."""
331ce0a0 1431 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
020f7150
RG
1432
1433 def _real_initialize(self):
1434 # Retrieve disclaimer
1987c232 1435 request = urllib2.Request(self._DISCLAIMER)
020f7150
RG
1436 try:
1437 self.report_disclaimer()
1438 disclaimer = urllib2.urlopen(request).read()
1439 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 1440 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
020f7150
RG
1441 return
1442
1443 # Confirm age
1444 disclaimer_form = {
2546e767 1445 'filters': '0',
020f7150
RG
1446 'submit': "Continue - I'm over 18",
1447 }
1987c232 1448 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
020f7150
RG
1449 try:
1450 self.report_age_confirmation()
1451 disclaimer = urllib2.urlopen(request).read()
1452 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 1453 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
020f7150 1454 return
d3975459 1455
020f7150
RG
1456 def _real_extract(self, url):
1457 # Extract id and simplified title from URL
1458 mobj = re.match(self._VALID_URL, url)
1459 if mobj is None:
147753eb 1460 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
6f21f686 1461 return
020f7150
RG
1462
1463 video_id = mobj.group(1)
1464
1465 # Check if video comes from YouTube
1466 mobj2 = re.match(r'^yt-(.*)$', video_id)
1467 if mobj2 is not None:
6f21f686
RG
1468 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1469 return
020f7150 1470
df372a65 1471 # At this point we have a new video
9bf7fa52 1472 self._downloader.increment_downloads()
df372a65 1473
020f7150 1474 simple_title = mobj.group(2).decode('utf-8')
020f7150
RG
1475
1476 # Retrieve video webpage to extract further information
1477 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1478 try:
1479 self.report_download_webpage(video_id)
1480 webpage = urllib2.urlopen(request).read()
1481 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 1482 self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
6f21f686 1483 return
020f7150
RG
1484
1485 # Extract URL, uploader and title from webpage
1486 self.report_extraction(video_id)
18963a36 1487 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
c6c555cf
RG
1488 if mobj is not None:
1489 mediaURL = urllib.unquote(mobj.group(1))
6b57e8c5 1490 video_extension = mediaURL[-3:]
d3975459 1491
c6c555cf
RG
1492 # Extract gdaKey if available
1493 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1494 if mobj is None:
1495 video_url = mediaURL
1496 else:
1497 gdaKey = mobj.group(1)
1498 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
109626fc 1499 else:
c6c555cf
RG
1500 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1501 if mobj is None:
1502 self._downloader.trouble(u'ERROR: unable to extract media URL')
1503 return
1504 vardict = parse_qs(mobj.group(1))
1505 if 'mediaData' not in vardict:
1506 self._downloader.trouble(u'ERROR: unable to extract media URL')
1507 return
1508 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1509 if mobj is None:
1510 self._downloader.trouble(u'ERROR: unable to extract media URL')
1511 return
6b57e8c5
RG
1512 mediaURL = mobj.group(1).replace('\\/', '/')
1513 video_extension = mediaURL[-3:]
1514 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
020f7150 1515
2546e767 1516 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
020f7150 1517 if mobj is None:
147753eb 1518 self._downloader.trouble(u'ERROR: unable to extract title')
6f21f686 1519 return
020f7150 1520 video_title = mobj.group(1).decode('utf-8')
490fd7ae 1521 video_title = sanitize_title(video_title)
020f7150 1522
29f07568 1523 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
020f7150 1524 if mobj is None:
147753eb 1525 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
6f21f686 1526 return
dbccb6cd 1527 video_uploader = mobj.group(1)
020f7150 1528
42bcd27d 1529 try:
1530 # Process video information
1531 self._downloader.process_info({
1532 'id': video_id.decode('utf-8'),
1533 'url': video_url.decode('utf-8'),
1534 'uploader': video_uploader.decode('utf-8'),
138b11f3 1535 'upload_date': u'NA',
42bcd27d 1536 'title': video_title,
1537 'stitle': simple_title,
1538 'ext': video_extension.decode('utf-8'),
6ba562b0 1539 'format': u'NA',
e616ec0c 1540 'player_url': None,
42bcd27d 1541 })
73f4e7af 1542 except UnavailableVideoError:
09cc744c 1543 self._downloader.trouble(u'\nERROR: unable to download video')
020f7150 1544
25af2bce 1545
4135fa45
WB
1546class DailymotionIE(InfoExtractor):
1547 """Information Extractor for Dailymotion"""
1548
1549 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
f3098c4d 1550 IE_NAME = u'dailymotion'
4135fa45
WB
1551
1552 def __init__(self, downloader=None):
1553 InfoExtractor.__init__(self, downloader)
1554
4135fa45
WB
1555 def report_download_webpage(self, video_id):
1556 """Report webpage download."""
331ce0a0 1557 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
d3975459 1558
4135fa45
WB
1559 def report_extraction(self, video_id):
1560 """Report information extraction."""
331ce0a0 1561 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
4135fa45
WB
1562
1563 def _real_initialize(self):
1564 return
1565
4135fa45
WB
1566 def _real_extract(self, url):
1567 # Extract id and simplified title from URL
1568 mobj = re.match(self._VALID_URL, url)
1569 if mobj is None:
1570 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1571 return
1572
df372a65 1573 # At this point we have a new video
9bf7fa52 1574 self._downloader.increment_downloads()
4135fa45
WB
1575 video_id = mobj.group(1)
1576
1577 simple_title = mobj.group(2).decode('utf-8')
1578 video_extension = 'flv'
1579
1580 # Retrieve video webpage to extract further information
1581 request = urllib2.Request(url)
62a29bbf 1582 request.add_header('Cookie', 'family_filter=off')
4135fa45
WB
1583 try:
1584 self.report_download_webpage(video_id)
1585 webpage = urllib2.urlopen(request).read()
1586 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 1587 self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
1588 return
1589
1590 # Extract URL, uploader and title from webpage
1591 self.report_extraction(video_id)
62a29bbf 1592 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
4135fa45
WB
1593 if mobj is None:
1594 self._downloader.trouble(u'ERROR: unable to extract media URL')
1595 return
62a29bbf 1596 sequence = urllib.unquote(mobj.group(1))
1597 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1598 if mobj is None:
1599 self._downloader.trouble(u'ERROR: unable to extract media URL')
1600 return
1601 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
4135fa45
WB
1602
1603 # if needed add http://www.dailymotion.com/ if relative URL
1604
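		# (Editor's sketch, not in the original:) one way to honour the note above is
		# to resolve a site-relative mediaURL against the Dailymotion root, while
		# leaving absolute URLs untouched.
		if not re.match(r'^https?://', mediaURL):
			import urlparse
			mediaURL = urlparse.urljoin('http://www.dailymotion.com/', mediaURL)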
1605 video_url = mediaURL
1606
62a29bbf 1607 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
4135fa45
WB
1608 if mobj is None:
1609 self._downloader.trouble(u'ERROR: unable to extract title')
1610 return
1611 video_title = mobj.group(1).decode('utf-8')
1612 video_title = sanitize_title(video_title)
1613
62a29bbf 1614 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
4135fa45
WB
1615 if mobj is None:
1616 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1617 return
1618 video_uploader = mobj.group(1)
1619
1620 try:
1621 # Process video information
1622 self._downloader.process_info({
1623 'id': video_id.decode('utf-8'),
1624 'url': video_url.decode('utf-8'),
1625 'uploader': video_uploader.decode('utf-8'),
138b11f3 1626 'upload_date': u'NA',
4135fa45
WB
1627 'title': video_title,
1628 'stitle': simple_title,
1629 'ext': video_extension.decode('utf-8'),
1630 'format': u'NA',
1631 'player_url': None,
1632 })
73f4e7af 1633 except UnavailableVideoError:
09cc744c 1634 self._downloader.trouble(u'\nERROR: unable to download video')
4135fa45 1635
c0a10ca8 1636
49c0028a 1637class GoogleIE(InfoExtractor):
1638 """Information extractor for video.google.com."""
1639
490fd7ae 1640 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
f3098c4d 1641 IE_NAME = u'video.google'
49c0028a 1642
1643 def __init__(self, downloader=None):
1644 InfoExtractor.__init__(self, downloader)
1645
49c0028a 1646 def report_download_webpage(self, video_id):
1647 """Report webpage download."""
331ce0a0 1648 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
49c0028a 1649
1650 def report_extraction(self, video_id):
1651 """Report information extraction."""
331ce0a0 1652 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
49c0028a 1653
1654 def _real_initialize(self):
1655 return
1656
1657 def _real_extract(self, url):
1658 # Extract id from URL
1659 mobj = re.match(self._VALID_URL, url)
1660 if mobj is None:
1661 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1662 return
1663
df372a65 1664 # At this point we have a new video
9bf7fa52 1665 self._downloader.increment_downloads()
49c0028a 1666 video_id = mobj.group(1)
1667
1668 video_extension = 'mp4'
1669
1670 # Retrieve video webpage to extract further information
490fd7ae 1671 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
49c0028a 1672 try:
1673 self.report_download_webpage(video_id)
1674 webpage = urllib2.urlopen(request).read()
1675 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1676 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1677 return
1678
1679 # Extract URL, uploader, and title from webpage
1680 self.report_extraction(video_id)
490fd7ae
RG
1681 mobj = re.search(r"download_url:'([^']+)'", webpage)
1682 if mobj is None:
1683 video_extension = 'flv'
1684 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
49c0028a 1685 if mobj is None:
1686 self._downloader.trouble(u'ERROR: unable to extract media URL')
1687 return
1688 mediaURL = urllib.unquote(mobj.group(1))
1689 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1690 mediaURL = mediaURL.replace('\\x26', '\x26')
1691
1692 video_url = mediaURL
1693
1694 mobj = re.search(r'<title>(.*)</title>', webpage)
1695 if mobj is None:
1696 self._downloader.trouble(u'ERROR: unable to extract title')
1697 return
1698 video_title = mobj.group(1).decode('utf-8')
490fd7ae 1699 video_title = sanitize_title(video_title)
31cbdaaf 1700 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
49c0028a 1701
7e58d568
RG
1702 # Extract video description
1703 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1704 if mobj is None:
1705 self._downloader.trouble(u'ERROR: unable to extract video description')
1706 return
1707 video_description = mobj.group(1).decode('utf-8')
1708 if not video_description:
1709 video_description = 'No description available.'
1710
1711 # Extract video thumbnail
1712 if self._downloader.params.get('forcethumbnail', False):
1713 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1714 try:
1715 webpage = urllib2.urlopen(request).read()
1716 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1717 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1718 return
1719 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1720 if mobj is None:
1721 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1722 return
1723 video_thumbnail = mobj.group(1)
1724 else: # we need something to pass to process_info
1725 video_thumbnail = ''
1726
49c0028a 1727 try:
1728 # Process video information
1729 self._downloader.process_info({
1730 'id': video_id.decode('utf-8'),
1731 'url': video_url.decode('utf-8'),
6ba562b0 1732 'uploader': u'NA',
138b11f3 1733 'upload_date': u'NA',
490fd7ae 1734 'title': video_title,
31cbdaaf 1735 'stitle': simple_title,
49c0028a 1736 'ext': video_extension.decode('utf-8'),
6ba562b0 1737 'format': u'NA',
e616ec0c 1738 'player_url': None,
49c0028a 1739 })
73f4e7af 1740 except UnavailableVideoError:
09cc744c 1741 self._downloader.trouble(u'\nERROR: unable to download video')
49c0028a 1742
1743
1744class PhotobucketIE(InfoExtractor):
1745 """Information extractor for photobucket.com."""
1746
1747 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
f3098c4d 1748 IE_NAME = u'photobucket'
49c0028a 1749
1750 def __init__(self, downloader=None):
1751 InfoExtractor.__init__(self, downloader)
1752
49c0028a 1753 def report_download_webpage(self, video_id):
1754 """Report webpage download."""
331ce0a0 1755 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
49c0028a 1756
1757 def report_extraction(self, video_id):
1758 """Report information extraction."""
331ce0a0 1759 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
49c0028a 1760
1761 def _real_initialize(self):
1762 return
1763
1764 def _real_extract(self, url):
1765 # Extract id from URL
1766 mobj = re.match(self._VALID_URL, url)
1767 if mobj is None:
1768 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1769 return
1770
df372a65 1771 # At this point we have a new video
9bf7fa52 1772 self._downloader.increment_downloads()
49c0028a 1773 video_id = mobj.group(1)
1774
1775 video_extension = 'flv'
1776
1777 # Retrieve video webpage to extract further information
1778 request = urllib2.Request(url)
1779 try:
1780 self.report_download_webpage(video_id)
1781 webpage = urllib2.urlopen(request).read()
1782 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1783 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1784 return
1785
1786 # Extract URL, uploader, and title from webpage
1787 self.report_extraction(video_id)
1788 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1789 if mobj is None:
1790 self._downloader.trouble(u'ERROR: unable to extract media URL')
1791 return
1792 mediaURL = urllib.unquote(mobj.group(1))
1793
1794 video_url = mediaURL
1795
1796 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1797 if mobj is None:
1798 self._downloader.trouble(u'ERROR: unable to extract title')
1799 return
1800 video_title = mobj.group(1).decode('utf-8')
490fd7ae 1801 video_title = sanitize_title(video_title)
31cbdaaf 1802 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
49c0028a 1803
1804 video_uploader = mobj.group(2).decode('utf-8')
1805
1806 try:
1807 # Process video information
1808 self._downloader.process_info({
1809 'id': video_id.decode('utf-8'),
1810 'url': video_url.decode('utf-8'),
490fd7ae 1811 'uploader': video_uploader,
138b11f3 1812 'upload_date': u'NA',
490fd7ae 1813 'title': video_title,
31cbdaaf 1814 'stitle': simple_title,
490fd7ae 1815 'ext': video_extension.decode('utf-8'),
6ba562b0 1816 'format': u'NA',
e616ec0c 1817 'player_url': None,
490fd7ae 1818 })
73f4e7af 1819 except UnavailableVideoError:
09cc744c 1820 self._downloader.trouble(u'\nERROR: unable to download video')
490fd7ae
RG
1821
1822
61945318
RG
1823class YahooIE(InfoExtractor):
1824 """Information extractor for video.yahoo.com."""
1825
1826 # _VALID_URL matches all Yahoo! Video URLs
1827 # _VPAGE_URL matches only the extractable '/watch/' URLs
1828 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1829 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
f3098c4d 1830 IE_NAME = u'video.yahoo'
61945318
RG
1831
1832 def __init__(self, downloader=None):
1833 InfoExtractor.__init__(self, downloader)
1834
61945318
RG
1835 def report_download_webpage(self, video_id):
1836 """Report webpage download."""
331ce0a0 1837 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
61945318
RG
1838
1839 def report_extraction(self, video_id):
1840 """Report information extraction."""
331ce0a0 1841 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
61945318
RG
1842
1843 def _real_initialize(self):
1844 return
1845
df372a65 1846 def _real_extract(self, url, new_video=True):
61945318
RG
1847 # Extract ID from URL
1848 mobj = re.match(self._VALID_URL, url)
1849 if mobj is None:
1850 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1851 return
1852
df372a65 1853 # At this point we have a new video
9bf7fa52 1854 self._downloader.increment_downloads()
61945318
RG
1855 video_id = mobj.group(2)
1856 video_extension = 'flv'
1857
1858 # Rewrite valid but non-extractable URLs as
1859 # extractable English language /watch/ URLs
1860 if re.match(self._VPAGE_URL, url) is None:
1861 request = urllib2.Request(url)
1862 try:
1863 webpage = urllib2.urlopen(request).read()
1864 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1865 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1866 return
1867
1868 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1869 if mobj is None:
1870 self._downloader.trouble(u'ERROR: Unable to extract id field')
1871 return
1872 yahoo_id = mobj.group(1)
1873
1874 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1875 if mobj is None:
1876 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1877 return
1878 yahoo_vid = mobj.group(1)
1879
1880 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
df372a65 1881 return self._real_extract(url, new_video=False)
61945318
RG
1882
1883 # Retrieve video webpage to extract further information
1884 request = urllib2.Request(url)
1885 try:
1886 self.report_download_webpage(video_id)
1887 webpage = urllib2.urlopen(request).read()
1888 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1889 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1890 return
1891
1892 # Extract uploader and title from webpage
1893 self.report_extraction(video_id)
1894 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1895 if mobj is None:
1896 self._downloader.trouble(u'ERROR: unable to extract video title')
1897 return
1898 video_title = mobj.group(1).decode('utf-8')
1899 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1900
1901 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1902 if mobj is None:
1903 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1904 return
 1905 video_uploader = mobj.group(2).decode('utf-8') # group(1) is the people/profile path segment; group(2) is the name
1906
7e58d568
RG
1907 # Extract video thumbnail
1908 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1909 if mobj is None:
1910 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1911 return
1912 video_thumbnail = mobj.group(1).decode('utf-8')
1913
1914 # Extract video description
1915 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1916 if mobj is None:
1917 self._downloader.trouble(u'ERROR: unable to extract video description')
1918 return
1919 video_description = mobj.group(1).decode('utf-8')
c0a10ca8
F
1920 if not video_description:
1921 video_description = 'No description available.'
7e58d568 1922
61945318
RG
1923 # Extract video height and width
1924 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1925 if mobj is None:
1926 self._downloader.trouble(u'ERROR: unable to extract video height')
1927 return
1928 yv_video_height = mobj.group(1)
1929
1930 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1931 if mobj is None:
1932 self._downloader.trouble(u'ERROR: unable to extract video width')
1933 return
1934 yv_video_width = mobj.group(1)
1935
1936 # Retrieve video playlist to extract media URL
1937 # I'm not completely sure what all these options are, but we
1938 # seem to need most of them, otherwise the server sends a 401.
1939 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1940 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1941 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
c0a10ca8
F
1942 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1943 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
61945318
RG
1944 try:
1945 self.report_download_webpage(video_id)
1946 webpage = urllib2.urlopen(request).read()
1947 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1948 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1949 return
1950
1951 # Extract media URL from playlist XML
1952 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1953 if mobj is None:
1954 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1955 return
1956 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1957 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1958
1959 try:
1960 # Process video information
1961 self._downloader.process_info({
1962 'id': video_id.decode('utf-8'),
1963 'url': video_url,
1964 'uploader': video_uploader,
138b11f3 1965 'upload_date': u'NA',
61945318
RG
1966 'title': video_title,
1967 'stitle': simple_title,
1968 'ext': video_extension.decode('utf-8'),
7e58d568
RG
 1969 'thumbnail': video_thumbnail.decode('utf-8'),
 1970 'description': video_description,
e616ec0c 1972 'player_url': None,
61945318 1973 })
73f4e7af 1974 except UnavailableVideoError:
09cc744c 1975 self._downloader.trouble(u'\nERROR: unable to download video')
61945318
RG
1976
1977
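# (Editor's note, not part of the original script:) the getPlaylistFOP.php query
# above is built by string concatenation; an equivalent sketch with
# urllib.urlencode is shown below. It is illustrative only (parameter order may
# differ, and urlencode percent-encodes the commas in 'adsupported').
def _yahoo_playlist_url(video_id, yv_video_height, yv_video_width):
	params = urllib.urlencode({
		'node_id': video_id,
		'tech': 'flash',
		'mode': 'playlist',
		'lg': 'R0xx6idZnW2zlrKP8xxAIR',
		'bitrate': '700',
		'vidH': yv_video_height,
		'vidW': yv_video_width,
		'swf': 'as3',
		'rd': 'video.yahoo.com',
		'tk': 'null',
		'adsupported': 'v1,v2,',
		'eventid': '1301797',
	})
	return 'http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?' + params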
92743d42
RB
1978class VimeoIE(InfoExtractor):
1979 """Information extractor for vimeo.com."""
1980
1981 # _VALID_URL matches Vimeo URLs
44c636df 1982 _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
f3098c4d 1983 IE_NAME = u'vimeo'
92743d42
RB
1984
1985 def __init__(self, downloader=None):
1986 InfoExtractor.__init__(self, downloader)
1987
92743d42
RB
1988 def report_download_webpage(self, video_id):
1989 """Report webpage download."""
0ecedbdb 1990 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
92743d42
RB
1991
1992 def report_extraction(self, video_id):
1993 """Report information extraction."""
0ecedbdb 1994 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
92743d42
RB
1995
1996 def _real_initialize(self):
1997 return
1998
1999 def _real_extract(self, url, new_video=True):
2000 # Extract ID from URL
2001 mobj = re.match(self._VALID_URL, url)
2002 if mobj is None:
2003 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2004 return
2005
2006 # At this point we have a new video
2007 self._downloader.increment_downloads()
2008 video_id = mobj.group(1)
92743d42
RB
2009
2010 # Retrieve video webpage to extract further information
2011 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2012 try:
2013 self.report_download_webpage(video_id)
2014 webpage = urllib2.urlopen(request).read()
2015 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2016 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2017 return
2018
f24c674b
RB
2019 # Now we begin extracting as much information as we can from what we
2020 # retrieved. First we extract the information common to all extractors,
 2021 # and later we extract those that are Vimeo-specific.
92743d42 2022 self.report_extraction(video_id)
f24c674b
RB
2023
2024 # Extract title
c5a088d3 2025 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
92743d42
RB
2026 if mobj is None:
2027 self._downloader.trouble(u'ERROR: unable to extract video title')
2028 return
2029 video_title = mobj.group(1).decode('utf-8')
2030 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2031
f24c674b 2032 # Extract uploader
c5a088d3 2033 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
92743d42
RB
2034 if mobj is None:
2035 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2036 return
2037 video_uploader = mobj.group(1).decode('utf-8')
2038
2039 # Extract video thumbnail
c5a088d3 2040 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
92743d42
RB
2041 if mobj is None:
2042 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2043 return
2044 video_thumbnail = mobj.group(1).decode('utf-8')
2045
2046 # # Extract video description
2047 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2048 # if mobj is None:
2049 # self._downloader.trouble(u'ERROR: unable to extract video description')
2050 # return
2051 # video_description = mobj.group(1).decode('utf-8')
2052 # if not video_description: video_description = 'No description available.'
2053 video_description = 'Foo.'
2054
f24c674b 2055 # Vimeo specific: extract request signature
c5a088d3 2056 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
92743d42
RB
2057 if mobj is None:
2058 self._downloader.trouble(u'ERROR: unable to extract request signature')
2059 return
2060 sig = mobj.group(1).decode('utf-8')
2061
f24c674b 2062 # Vimeo specific: Extract request signature expiration
c5a088d3 2063 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
92743d42
RB
2064 if mobj is None:
2065 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2066 return
2067 sig_exp = mobj.group(1).decode('utf-8')
2068
2069 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2070
2071 try:
2072 # Process video information
2073 self._downloader.process_info({
2074 'id': video_id.decode('utf-8'),
2075 'url': video_url,
2076 'uploader': video_uploader,
2077 'upload_date': u'NA',
2078 'title': video_title,
2079 'stitle': simple_title,
2fc31a48 2080 'ext': u'mp4',
92743d42
RB
 2081 'thumbnail': video_thumbnail.decode('utf-8'),
 2082 'description': video_description,
2085 'player_url': None,
2086 })
2087 except UnavailableVideoError:
2088 self._downloader.trouble(u'ERROR: unable to download video')
2089
2090
490fd7ae
RG
2091class GenericIE(InfoExtractor):
2092 """Generic last-resort information extractor."""
2093
f3098c4d
PH
2094 _VALID_URL = r'.*'
2095 IE_NAME = u'generic'
bdb3f7a7 2096
490fd7ae
RG
2097 def __init__(self, downloader=None):
2098 InfoExtractor.__init__(self, downloader)
2099
490fd7ae
RG
2100 def report_download_webpage(self, video_id):
2101 """Report webpage download."""
331ce0a0
RG
2102 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2103 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
490fd7ae
RG
2104
2105 def report_extraction(self, video_id):
2106 """Report information extraction."""
331ce0a0 2107 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
490fd7ae
RG
2108
2109 def _real_initialize(self):
2110 return
2111
2112 def _real_extract(self, url):
df372a65 2113 # At this point we have a new video
9bf7fa52 2114 self._downloader.increment_downloads()
df372a65 2115
490fd7ae
RG
2116 video_id = url.split('/')[-1]
2117 request = urllib2.Request(url)
2118 try:
2119 self.report_download_webpage(video_id)
2120 webpage = urllib2.urlopen(request).read()
2121 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2122 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2123 return
2124 except ValueError, err:
2125 # since this is the last-resort InfoExtractor, if
2126 # this error is thrown, it'll be thrown here
2127 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2128 return
2129
a9806fd8 2130 self.report_extraction(video_id)
490fd7ae
RG
2131 # Start with something easy: JW Player in SWFObject
2132 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2133 if mobj is None:
2134 # Broaden the search a little bit
2135 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2136 if mobj is None:
2137 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2138 return
2139
2140 # It's possible that one of the regexes
2141 # matched, but returned an empty group:
2142 if mobj.group(1) is None:
2143 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2144 return
2145
2146 video_url = urllib.unquote(mobj.group(1))
c0a10ca8 2147 video_id = os.path.basename(video_url)
490fd7ae
RG
2148
2149 # here's a fun little line of code for you:
2150 video_extension = os.path.splitext(video_id)[1][1:]
c0a10ca8 2151 video_id = os.path.splitext(video_id)[0]
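		# e.g. a video_id of 'clip.flv' yields video_extension 'flv' and video_id 'clip'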
490fd7ae
RG
2152
2153 # it's tempting to parse this further, but you would
2154 # have to take into account all the variations like
2155 # Video Title - Site Name
2156 # Site Name | Video Title
2157 # Video Title - Tagline | Site Name
2158 # and so on and so forth; it's just not practical
2159 mobj = re.search(r'<title>(.*)</title>', webpage)
2160 if mobj is None:
2161 self._downloader.trouble(u'ERROR: unable to extract title')
2162 return
2163 video_title = mobj.group(1).decode('utf-8')
2164 video_title = sanitize_title(video_title)
31cbdaaf 2165 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
490fd7ae
RG
2166
2167 # video uploader is domain name
2168 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2169 if mobj is None:
 2170 self._downloader.trouble(u'ERROR: unable to extract uploader')
2171 return
2172 video_uploader = mobj.group(1).decode('utf-8')
2173
2174 try:
2175 # Process video information
2176 self._downloader.process_info({
2177 'id': video_id.decode('utf-8'),
2178 'url': video_url.decode('utf-8'),
2179 'uploader': video_uploader,
138b11f3 2180 'upload_date': u'NA',
490fd7ae 2181 'title': video_title,
31cbdaaf 2182 'stitle': simple_title,
49c0028a 2183 'ext': video_extension.decode('utf-8'),
6ba562b0 2184 'format': u'NA',
e616ec0c 2185 'player_url': None,
49c0028a 2186 })
73f4e7af 2187 except UnavailableVideoError, err:
09cc744c 2188 self._downloader.trouble(u'\nERROR: unable to download video')
49c0028a 2189
2190
25af2bce
RG
2191class YoutubeSearchIE(InfoExtractor):
2192 """Information Extractor for YouTube search queries."""
bdb3f7a7 2193 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
25af2bce
RG
2194 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2195 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
304a4d85 2196 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
25af2bce 2197 _youtube_ie = None
fd9288c3 2198 _max_youtube_results = 1000
f3098c4d 2199 IE_NAME = u'youtube:search'
25af2bce 2200
f995f712 2201 def __init__(self, youtube_ie, downloader=None):
25af2bce
RG
2202 InfoExtractor.__init__(self, downloader)
2203 self._youtube_ie = youtube_ie
d3975459 2204
25af2bce
RG
2205 def report_download_page(self, query, pagenum):
2206 """Report attempt to download playlist page with given number."""
490fd7ae 2207 query = query.decode(preferredencoding())
331ce0a0 2208 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
25af2bce
RG
2209
2210 def _real_initialize(self):
2211 self._youtube_ie.initialize()
d3975459 2212
25af2bce 2213 def _real_extract(self, query):
bdb3f7a7 2214 mobj = re.match(self._VALID_URL, query)
25af2bce 2215 if mobj is None:
147753eb 2216 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
6f21f686 2217 return
25af2bce
RG
2218
2219 prefix, query = query.split(':')
2220 prefix = prefix[8:]
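		# len('ytsearch') == 8, so what remains is the optional count suffix: '', 'all' or digits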
c0a10ca8 2221 query = query.encode('utf-8')
f995f712 2222 if prefix == '':
6f21f686
RG
2223 self._download_n_results(query, 1)
2224 return
f995f712 2225 elif prefix == 'all':
6f21f686
RG
2226 self._download_n_results(query, self._max_youtube_results)
2227 return
f995f712 2228 else:
25af2bce 2229 try:
e1f18b8a 2230 n = long(prefix)
25af2bce 2231 if n <= 0:
147753eb 2232 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
6f21f686 2233 return
257453b9 2234 elif n > self._max_youtube_results:
c0a10ca8 2235 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
257453b9 2236 n = self._max_youtube_results
6f21f686
RG
2237 self._download_n_results(query, n)
2238 return
e1f18b8a 2239 except ValueError: # parsing prefix as integer fails
6f21f686
RG
2240 self._download_n_results(query, 1)
2241 return
25af2bce
RG
2242
2243 def _download_n_results(self, query, n):
2244 """Downloads a specified number of results for a query"""
2245
2246 video_ids = []
2247 already_seen = set()
2248 pagenum = 1
2249
2250 while True:
2251 self.report_download_page(query, pagenum)
a9633f14 2252 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1987c232 2253 request = urllib2.Request(result_url)
25af2bce
RG
2254 try:
2255 page = urllib2.urlopen(request).read()
2256 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 2257 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
6f21f686 2258 return
25af2bce
RG
2259
2260 # Extract video identifiers
2261 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2262 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2263 if video_id not in already_seen:
2264 video_ids.append(video_id)
2265 already_seen.add(video_id)
2266 if len(video_ids) == n:
2267 # Specified n videos reached
25af2bce 2268 for id in video_ids:
6f21f686
RG
2269 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2270 return
25af2bce 2271
304a4d85 2272 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
25af2bce 2273 for id in video_ids:
6f21f686
RG
2274 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2275 return
25af2bce
RG
2276
2277 pagenum = pagenum + 1
2278
c0a10ca8 2279
7e58d568
RG
2280class GoogleSearchIE(InfoExtractor):
2281 """Information Extractor for Google Video search queries."""
bdb3f7a7 2282 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
7e58d568
RG
2283 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2284 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2285 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2286 _google_ie = None
2287 _max_google_results = 1000
f3098c4d 2288 IE_NAME = u'video.google:search'
7e58d568
RG
2289
2290 def __init__(self, google_ie, downloader=None):
2291 InfoExtractor.__init__(self, downloader)
2292 self._google_ie = google_ie
d3975459 2293
7e58d568
RG
2294 def report_download_page(self, query, pagenum):
2295 """Report attempt to download playlist page with given number."""
2296 query = query.decode(preferredencoding())
331ce0a0 2297 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
7e58d568
RG
2298
2299 def _real_initialize(self):
2300 self._google_ie.initialize()
d3975459 2301
7e58d568 2302 def _real_extract(self, query):
bdb3f7a7 2303 mobj = re.match(self._VALID_URL, query)
7e58d568
RG
2304 if mobj is None:
2305 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2306 return
2307
2308 prefix, query = query.split(':')
2309 prefix = prefix[8:]
c0a10ca8 2310 query = query.encode('utf-8')
7e58d568
RG
2311 if prefix == '':
2312 self._download_n_results(query, 1)
2313 return
2314 elif prefix == 'all':
2315 self._download_n_results(query, self._max_google_results)
2316 return
2317 else:
2318 try:
2319 n = long(prefix)
2320 if n <= 0:
2321 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2322 return
2323 elif n > self._max_google_results:
c0a10ca8 2324 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
7e58d568
RG
2325 n = self._max_google_results
2326 self._download_n_results(query, n)
2327 return
2328 except ValueError: # parsing prefix as integer fails
2329 self._download_n_results(query, 1)
2330 return
2331
2332 def _download_n_results(self, query, n):
2333 """Downloads a specified number of results for a query"""
2334
2335 video_ids = []
2336 already_seen = set()
2337 pagenum = 1
2338
2339 while True:
2340 self.report_download_page(query, pagenum)
2341 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1987c232 2342 request = urllib2.Request(result_url)
7e58d568
RG
2343 try:
2344 page = urllib2.urlopen(request).read()
2345 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2346 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2347 return
2348
2349 # Extract video identifiers
2350 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2351 video_id = mobj.group(1)
2352 if video_id not in already_seen:
2353 video_ids.append(video_id)
2354 already_seen.add(video_id)
2355 if len(video_ids) == n:
2356 # Specified n videos reached
2357 for id in video_ids:
2358 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2359 return
2360
2361 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2362 for id in video_ids:
2363 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2364 return
2365
2366 pagenum = pagenum + 1
2367
c0a10ca8 2368
7e58d568
RG
2369class YahooSearchIE(InfoExtractor):
2370 """Information Extractor for Yahoo! Video search queries."""
bdb3f7a7 2371 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
7e58d568
RG
2372 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2373 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2374 _MORE_PAGES_INDICATOR = r'\s*Next'
2375 _yahoo_ie = None
2376 _max_yahoo_results = 1000
f3098c4d 2377 IE_NAME = u'video.yahoo:search'
7e58d568
RG
2378
2379 def __init__(self, yahoo_ie, downloader=None):
2380 InfoExtractor.__init__(self, downloader)
2381 self._yahoo_ie = yahoo_ie
d3975459 2382
7e58d568
RG
2383 def report_download_page(self, query, pagenum):
2384 """Report attempt to download playlist page with given number."""
2385 query = query.decode(preferredencoding())
331ce0a0 2386 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
7e58d568
RG
2387
2388 def _real_initialize(self):
2389 self._yahoo_ie.initialize()
d3975459 2390
7e58d568 2391 def _real_extract(self, query):
bdb3f7a7 2392 mobj = re.match(self._VALID_URL, query)
7e58d568
RG
2393 if mobj is None:
2394 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2395 return
2396
2397 prefix, query = query.split(':')
2398 prefix = prefix[8:]
c0a10ca8 2399 query = query.encode('utf-8')
7e58d568
RG
2400 if prefix == '':
2401 self._download_n_results(query, 1)
2402 return
2403 elif prefix == 'all':
2404 self._download_n_results(query, self._max_yahoo_results)
2405 return
2406 else:
2407 try:
2408 n = long(prefix)
2409 if n <= 0:
2410 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2411 return
2412 elif n > self._max_yahoo_results:
c0a10ca8 2413 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
7e58d568
RG
2414 n = self._max_yahoo_results
2415 self._download_n_results(query, n)
2416 return
2417 except ValueError: # parsing prefix as integer fails
2418 self._download_n_results(query, 1)
2419 return
2420
2421 def _download_n_results(self, query, n):
2422 """Downloads a specified number of results for a query"""
2423
2424 video_ids = []
2425 already_seen = set()
2426 pagenum = 1
2427
2428 while True:
2429 self.report_download_page(query, pagenum)
2430 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1987c232 2431 request = urllib2.Request(result_url)
7e58d568
RG
2432 try:
2433 page = urllib2.urlopen(request).read()
2434 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2435 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2436 return
2437
2438 # Extract video identifiers
2439 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2440 video_id = mobj.group(1)
2441 if video_id not in already_seen:
2442 video_ids.append(video_id)
2443 already_seen.add(video_id)
2444 if len(video_ids) == n:
2445 # Specified n videos reached
2446 for id in video_ids:
2447 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2448 return
2449
2450 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2451 for id in video_ids:
2452 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2453 return
2454
2455 pagenum = pagenum + 1
2456
c0a10ca8 2457
0c2dc87d
RG
2458class YoutubePlaylistIE(InfoExtractor):
2459 """Information Extractor for YouTube playlists."""
2460
7a2cf545 2461 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
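	# The pattern above matches e.g. youtube.com/view_play_list?p=<id>,
	# youtube.com/playlist?list=PL<id>, youtube.com/p/<id> and
	# youtube.com/user/<name>#g/c/<id>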
f74e22ae 2462 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
0c2dc87d 2463 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
ce5cafea 2464 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
0c2dc87d 2465 _youtube_ie = None
f3098c4d 2466 IE_NAME = u'youtube:playlist'
0c2dc87d
RG
2467
2468 def __init__(self, youtube_ie, downloader=None):
2469 InfoExtractor.__init__(self, downloader)
2470 self._youtube_ie = youtube_ie
d3975459 2471
0c2dc87d
RG
2472 def report_download_page(self, playlist_id, pagenum):
2473 """Report attempt to download playlist page with given number."""
331ce0a0 2474 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
0c2dc87d
RG
2475
2476 def _real_initialize(self):
2477 self._youtube_ie.initialize()
d3975459 2478
0c2dc87d
RG
2479 def _real_extract(self, url):
2480 # Extract playlist id
2481 mobj = re.match(self._VALID_URL, url)
2482 if mobj is None:
147753eb 2483 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
6f21f686 2484 return
0c2dc87d 2485
d119b54d
RG
2486 # Single video case
2487 if mobj.group(3) is not None:
2488 self._youtube_ie.extract(mobj.group(3))
2489 return
2490
0c2dc87d 2491 # Download playlist pages
f74e22ae
GI
2492 # prefix is 'p' as default for playlists but there are other types that need extra care
2493 playlist_prefix = mobj.group(1)
2494 if playlist_prefix == 'a':
2495 playlist_access = 'artist'
2496 else:
7cc3c6fd 2497 playlist_prefix = 'p'
f74e22ae
GI
2498 playlist_access = 'view_play_list'
2499 playlist_id = mobj.group(2)
0c2dc87d
RG
2500 video_ids = []
2501 pagenum = 1
2502
2503 while True:
2504 self.report_download_page(playlist_id, pagenum)
f74e22ae 2505 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
0c2dc87d
RG
2506 try:
2507 page = urllib2.urlopen(request).read()
2508 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 2509 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
6f21f686 2510 return
0c2dc87d
RG
2511
2512 # Extract video identifiers
27d98b6e 2513 ids_in_page = []
0c2dc87d 2514 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
27d98b6e
RG
2515 if mobj.group(1) not in ids_in_page:
2516 ids_in_page.append(mobj.group(1))
2517 video_ids.extend(ids_in_page)
0c2dc87d 2518
ce5cafea 2519 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
0c2dc87d
RG
2520 break
2521 pagenum = pagenum + 1
2522
8cc44341
RG
 2523 playliststart = self._downloader.params.get('playliststart', 1) - 1
 2524 playlistend = self._downloader.params.get('playlistend', -1)
 if playlistend == -1: playlistend = None # -1 means "up to the last video"; a [start:-1] slice would drop it
 2525 video_ids = video_ids[playliststart:playlistend]
2526
0c2dc87d 2527 for id in video_ids:
6f21f686
RG
2528 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2529 return
0c2dc87d 2530
c0a10ca8 2531
c39c05cd
A
2532class YoutubeUserIE(InfoExtractor):
2533 """Information Extractor for YouTube users."""
2534
b845d58b 2535 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
c39c05cd 2536 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
5aba6ea4
RG
2537 _GDATA_PAGE_SIZE = 50
2538 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2539 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
c39c05cd 2540 _youtube_ie = None
f3098c4d 2541 IE_NAME = u'youtube:user'
c39c05cd
A
2542
2543 def __init__(self, youtube_ie, downloader=None):
2544 InfoExtractor.__init__(self, downloader)
2545 self._youtube_ie = youtube_ie
d3975459 2546
5aba6ea4 2547 def report_download_page(self, username, start_index):
c39c05cd 2548 """Report attempt to download user page."""
5aba6ea4 2549 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
c0a10ca8 2550 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
c39c05cd
A
2551
2552 def _real_initialize(self):
2553 self._youtube_ie.initialize()
d3975459 2554
c39c05cd
A
2555 def _real_extract(self, url):
2556 # Extract username
2557 mobj = re.match(self._VALID_URL, url)
2558 if mobj is None:
2559 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2560 return
2561
c39c05cd 2562 username = mobj.group(1)
5aba6ea4
RG
2563
2564 # Download video ids using YouTube Data API. Result size per
2565 # query is limited (currently to 50 videos) so we need to query
2566 # page by page until there are no video ids - it means we got
2567 # all of them.
2568
c39c05cd 2569 video_ids = []
5aba6ea4 2570 pagenum = 0
c39c05cd 2571
5aba6ea4
RG
2572 while True:
2573 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
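			# GData's start-index is 1-based: with a page size of 50, pagenum 0, 1, 2 map to 1, 51, 101, ...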
2574 self.report_download_page(username, start_index)
c39c05cd 2575
5aba6ea4 2576 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
c39c05cd 2577
5aba6ea4
RG
2578 try:
2579 page = urllib2.urlopen(request).read()
2580 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2581 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2582 return
c39c05cd 2583
5aba6ea4
RG
2584 # Extract video identifiers
2585 ids_in_page = []
2586
2587 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2588 if mobj.group(1) not in ids_in_page:
2589 ids_in_page.append(mobj.group(1))
2590
2591 video_ids.extend(ids_in_page)
2592
2593 # A little optimization - if current page is not
2594 # "full", ie. does not contain PAGE_SIZE video ids then
2595 # we can assume that this page is the last one - there
2596 # are no more ids on further pages - no need to query
2597 # again.
2598
2599 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2600 break
2601
2602 pagenum += 1
2603
2604 all_ids_count = len(video_ids)
8cc44341
RG
2605 playliststart = self._downloader.params.get('playliststart', 1) - 1
2606 playlistend = self._downloader.params.get('playlistend', -1)
204c9398 2607
5aba6ea4
RG
2608 if playlistend == -1:
2609 video_ids = video_ids[playliststart:]
2610 else:
2611 video_ids = video_ids[playliststart:playlistend]
7a9054ec 2612
5aba6ea4 2613 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
c0a10ca8 2614 (username, all_ids_count, len(video_ids)))
5aba6ea4
RG
2615
2616 for video_id in video_ids:
2617 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2618
c39c05cd 2619
27179cfd
VV
2620class DepositFilesIE(InfoExtractor):
2621 """Information extractor for depositfiles.com"""
2622
b845d58b 2623 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
f3098c4d 2624 IE_NAME = u'DepositFiles'
27179cfd
VV
2625
2626 def __init__(self, downloader=None):
2627 InfoExtractor.__init__(self, downloader)
2628
27179cfd
VV
2629 def report_download_webpage(self, file_id):
2630 """Report webpage download."""
2631 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2632
2633 def report_extraction(self, file_id):
2634 """Report information extraction."""
2635 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2636
2637 def _real_initialize(self):
2638 return
2639
2640 def _real_extract(self, url):
2641 # At this point we have a new file
2642 self._downloader.increment_downloads()
2643
2644 file_id = url.split('/')[-1]
2645 # Rebuild url in english locale
2646 url = 'http://depositfiles.com/en/files/' + file_id
2647
2648 # Retrieve file webpage with 'Free download' button pressed
2649 free_download_indication = { 'gateway_result' : '1' }
1987c232 2650 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
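		# The urlencode call above produces the POST body 'gateway_result=1'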
27179cfd
VV
2651 try:
2652 self.report_download_webpage(file_id)
2653 webpage = urllib2.urlopen(request).read()
2654 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2655 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2656 return
2657
2658 # Search for the real file URL
2659 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2660 if (mobj is None) or (mobj.group(1) is None):
2661 # Try to figure out reason of the error.
2662 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2663 if (mobj is not None) and (mobj.group(1) is not None):
 2664 restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
2665 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2666 else:
2667 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2668 return
2669
2670 file_url = mobj.group(1)
2671 file_extension = os.path.splitext(file_url)[1][1:]
2672
2673 # Search for file title
2674 mobj = re.search(r'<b title="(.*?)">', webpage)
2675 if mobj is None:
2676 self._downloader.trouble(u'ERROR: unable to extract title')
2677 return
2678 file_title = mobj.group(1).decode('utf-8')
2679
2680 try:
2681 # Process file information
2682 self._downloader.process_info({
2683 'id': file_id.decode('utf-8'),
2684 'url': file_url.decode('utf-8'),
2685 'uploader': u'NA',
2686 'upload_date': u'NA',
2687 'title': file_title,
2688 'stitle': file_title,
2689 'ext': file_extension.decode('utf-8'),
2690 'format': u'NA',
2691 'player_url': None,
2692 })
2693 except UnavailableVideoError, err:
2694 self._downloader.trouble(u'ERROR: unable to download file')
2695
c0a10ca8 2696
9f5f9602
GI
2697class FacebookIE(InfoExtractor):
2698 """Information Extractor for Facebook"""
2699
b845d58b 2700 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/video/video\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
9f5f9602
GI
2701 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2702 _NETRC_MACHINE = 'facebook'
2703 _available_formats = ['highqual', 'lowqual']
2704 _video_extensions = {
2705 'highqual': 'mp4',
2706 'lowqual': 'mp4',
2707 }
f3098c4d 2708 IE_NAME = u'facebook'
9f5f9602
GI
2709
2710 def __init__(self, downloader=None):
2711 InfoExtractor.__init__(self, downloader)
2712
9f5f9602
GI
2713 def _reporter(self, message):
2714 """Add header and report message."""
2715 self._downloader.to_screen(u'[facebook] %s' % message)
2716
2717 def report_login(self):
2718 """Report attempt to log in."""
2719 self._reporter(u'Logging in')
2720
2721 def report_video_webpage_download(self, video_id):
2722 """Report attempt to download video webpage."""
2723 self._reporter(u'%s: Downloading video webpage' % video_id)
2724
2725 def report_information_extraction(self, video_id):
2726 """Report attempt to extract video information."""
2727 self._reporter(u'%s: Extracting video information' % video_id)
2728
2729 def _parse_page(self, video_webpage):
2730 """Extract video information from page"""
2731 # General data
2732 data = {'title': r'class="video_title datawrap">(.*?)</',
2733 'description': r'<div class="datawrap">(.*?)</div>',
2734 'owner': r'\("video_owner_name", "(.*?)"\)',
2735 'upload_date': r'data-date="(.*?)"',
2736 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2737 }
2738 video_info = {}
2739 for piece in data.keys():
2740 mobj = re.search(data[piece], video_webpage)
2741 if mobj is not None:
2742 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2743
2744 # Video urls
2745 video_urls = {}
2746 for fmt in self._available_formats:
2747 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2748 if mobj is not None:
2749 # URL is in a Javascript segment inside an escaped Unicode format within
2750 # the generally utf-8 page
2751 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
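				# e.g. the escaped sequence '\u0025' decodes to '%'; unquote_plus()
				# then resolves the remaining percent-encoding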
2752 video_info['video_urls'] = video_urls
2753
2754 return video_info
2755
2756 def _real_initialize(self):
2757 if self._downloader is None:
2758 return
2759
2760 useremail = None
2761 password = None
2762 downloader_params = self._downloader.params
2763
2764 # Attempt to use provided username and password or .netrc data
2765 if downloader_params.get('username', None) is not None:
2766 useremail = downloader_params['username']
2767 password = downloader_params['password']
2768 elif downloader_params.get('usenetrc', False):
2769 try:
2770 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2771 if info is not None:
2772 useremail = info[0]
2773 password = info[2]
2774 else:
2775 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2776 except (IOError, netrc.NetrcParseError), err:
2777 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2778 return
2779
2780 if useremail is None:
2781 return
2782
2783 # Log in
2784 login_form = {
2785 'email': useremail,
2786 'pass': password,
2787 'login': 'Log+In'
2788 }
2789 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2790 try:
2791 self.report_login()
2792 login_results = urllib2.urlopen(request).read()
2793 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
 2794 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
2795 return
2796 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2797 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2798 return
2799
2800 def _real_extract(self, url):
2801 mobj = re.match(self._VALID_URL, url)
2802 if mobj is None:
2803 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2804 return
2805 video_id = mobj.group('ID')
2806
2807 # Get video webpage
2808 self.report_video_webpage_download(video_id)
2809 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2810 try:
2811 page = urllib2.urlopen(request)
2812 video_webpage = page.read()
2813 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2814 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2815 return
2816
2817 # Start extracting information
2818 self.report_information_extraction(video_id)
2819
2820 # Extract information
2821 video_info = self._parse_page(video_webpage)
2822
2823 # uploader
2824 if 'owner' not in video_info:
2825 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2826 return
2827 video_uploader = video_info['owner']
2828
2829 # title
2830 if 'title' not in video_info:
2831 self._downloader.trouble(u'ERROR: unable to extract video title')
2832 return
2833 video_title = video_info['title']
2834 video_title = video_title.decode('utf-8')
2835 video_title = sanitize_title(video_title)
2836
2837 # simplified title
2838 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2839 simple_title = simple_title.strip(ur'_')
2840
2841 # thumbnail image
2842 if 'thumbnail' not in video_info:
2843 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2844 video_thumbnail = ''
2845 else:
2846 video_thumbnail = video_info['thumbnail']
2847
2848 # upload date
2849 upload_date = u'NA'
2850 if 'upload_date' in video_info:
2851 upload_time = video_info['upload_date']
2852 timetuple = email.utils.parsedate_tz(upload_time)
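				# e.g. email.utils.parsedate_tz('Mon, 20 Nov 1995 19:12:08 -0500') returns a
				# 10-element tuple; the first nine fields are what time.strftime() expects below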
2853 if timetuple is not None:
2854 try:
2855 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2856 except:
2857 pass
2858
2859 # description
8b95c387 2860 video_description = video_info.get('description', 'No description available.')
9f5f9602
GI
2861
2862 url_map = video_info['video_urls']
2863 if len(url_map.keys()) > 0:
2864 # Decide which formats to download
2865 req_format = self._downloader.params.get('format', None)
2866 format_limit = self._downloader.params.get('format_limit', None)
2867
2868 if format_limit is not None and format_limit in self._available_formats:
2869 format_list = self._available_formats[self._available_formats.index(format_limit):]
2870 else:
2871 format_list = self._available_formats
2872 existing_formats = [x for x in format_list if x in url_map]
2873 if len(existing_formats) == 0:
2874 self._downloader.trouble(u'ERROR: no known formats available for video')
2875 return
2876 if req_format is None:
2877 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
c52b01f3
K
2878 elif req_format == 'worst':
2879 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
9f5f9602
GI
2880 elif req_format == '-1':
2881 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2882 else:
2883 # Specific format
2884 if req_format not in url_map:
2885 self._downloader.trouble(u'ERROR: requested format not available')
2886 return
2887 video_url_list = [(req_format, url_map[req_format])] # Specific format
2888
2889 for format_param, video_real_url in video_url_list:
2890
2891 # At this point we have a new video
2892 self._downloader.increment_downloads()
2893
2894 # Extension
2895 video_extension = self._video_extensions.get(format_param, 'mp4')
2896
9f5f9602
GI
2897 try:
2898 # Process video information
2899 self._downloader.process_info({
2900 'id': video_id.decode('utf-8'),
2901 'url': video_real_url.decode('utf-8'),
2902 'uploader': video_uploader.decode('utf-8'),
2903 'upload_date': upload_date,
2904 'title': video_title,
2905 'stitle': simple_title,
2906 'ext': video_extension.decode('utf-8'),
2907 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2908 'thumbnail': video_thumbnail.decode('utf-8'),
2909 'description': video_description.decode('utf-8'),
2910 'player_url': None,
2911 })
2912 except UnavailableVideoError, err:
2913 self._downloader.trouble(u'\nERROR: unable to download video')
2914
7745f5d8
PH
2915class BlipTVIE(InfoExtractor):
2916 """Information extractor for blip.tv"""
2917
1cab2c6d 2918 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
7745f5d8 2919 _URL_EXT = r'^.*\.([a-z0-9]+)$'
f3098c4d 2920 IE_NAME = u'blip.tv'
7745f5d8 2921
7745f5d8
PH
2922 def report_extraction(self, file_id):
2923 """Report information extraction."""
54f329fe
PH
2924 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2925
2926 def report_direct_download(self, title):
2927 """Report direct download."""
2928 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
7745f5d8
PH
2929
2930 def _simplify_title(self, title):
2931 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2932 res = res.strip(ur'_')
2933 return res
2934
2935 def _real_extract(self, url):
2936 mobj = re.match(self._VALID_URL, url)
2937 if mobj is None:
2938 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2939 return
2940
1293ce58
PH
2941 if '?' in url:
2942 cchar = '&'
2943 else:
2944 cchar = '?'
2945 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
7745f5d8 2946 request = urllib2.Request(json_url)
aded78d9 2947 self.report_extraction(mobj.group(1))
54f329fe 2948 info = None
7745f5d8 2949 try:
54f329fe
PH
2950 urlh = urllib2.urlopen(request)
2951 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2952 basename = url.split('/')[-1]
2953 title,ext = os.path.splitext(basename)
2954 ext = ext.replace('.', '')
2955 self.report_direct_download(title)
2956 info = {
2957 'id': title,
2958 'url': url,
2959 'title': title,
2960 'stitle': self._simplify_title(title),
2961 'ext': ext,
2962 'urlhandle': urlh
2963 }
7745f5d8
PH
2964 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2965 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2966 return
54f329fe
PH
2967 if info is None: # Regular URL
2968 try:
2969 json_code = urlh.read()
2970 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2971 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2972 return
7745f5d8 2973
54f329fe
PH
2974 try:
2975 json_data = json.loads(json_code)
2976 if 'Post' in json_data:
2977 data = json_data['Post']
2978 else:
2979 data = json_data
2980
2981 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
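# e.g. a datestamp such as '09-24-11 08:30PM' (matching '%m-%d-%y %H:%M%p' above)
# becomes '20110924'; the sample value is illustrative, assuming blip.tv keeps this format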
2982 video_url = data['media']['url']
2983 umobj = re.match(self._URL_EXT, video_url)
2984 if umobj is None:
2985 raise ValueError('Can not determine filename extension')
2986 ext = umobj.group(1)
2987
2988 info = {
2989 'id': data['item_id'],
2990 'url': video_url,
2991 'uploader': data['display_name'],
2992 'upload_date': upload_date,
2993 'title': data['title'],
2994 'stitle': self._simplify_title(data['title']),
2995 'ext': ext,
2996 'format': data['media']['mimeType'],
2997 'thumbnail': data['thumbnailUrl'],
2998 'description': data['description'],
2999 'player_url': data['embedUrl']
3000 }
3001 except (ValueError,KeyError), err:
3002 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3003 return
a1cab7ce 3004
54f329fe 3005 self._downloader.increment_downloads()
7745f5d8
PH
3006
3007 try:
3008 self._downloader.process_info(info)
3009 except UnavailableVideoError, err:
3010 self._downloader.trouble(u'\nERROR: unable to download video')
3011
3012
9b0a8bc1
PH
3013class MyVideoIE(InfoExtractor):
3014 """Information Extractor for myvideo.de."""
3015
3016 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
f3098c4d 3017 IE_NAME = u'myvideo'
9b0a8bc1
PH
3018
3019 def __init__(self, downloader=None):
3020 InfoExtractor.__init__(self, downloader)
3021
9b0a8bc1
PH
3022 def report_download_webpage(self, video_id):
3023 """Report webpage download."""
3024 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3025
3026 def report_extraction(self, video_id):
3027 """Report information extraction."""
3028 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3029
3030 def _real_initialize(self):
3031 return
3032
3033 def _real_extract(self,url):
3034 mobj = re.match(self._VALID_URL, url)
3035 if mobj is None:
3036 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3037 return
3038
3039 video_id = mobj.group(1)
3040 simple_title = mobj.group(2).decode('utf-8')
3041 # should actually not be necessary
3042 simple_title = sanitize_title(simple_title)
3043 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
3044
3045 # Get video webpage
3046 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3047 try:
3048 self.report_download_webpage(video_id)
3049 webpage = urllib2.urlopen(request).read()
3050 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3051 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3052 return
3053
3054 self.report_extraction(video_id)
3055 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3056 webpage)
3057 if mobj is None:
3058 self._downloader.trouble(u'ERROR: unable to extract media URL')
3059 return
3060 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3061
3062 mobj = re.search('<title>([^<]+)</title>', webpage)
3063 if mobj is None:
3064 self._downloader.trouble(u'ERROR: unable to extract title')
3065 return
3066
3067 video_title = mobj.group(1)
3068 video_title = sanitize_title(video_title)
3069
3070 try:
9b0a8bc1
PH
3071 self._downloader.process_info({
3072 'id': video_id,
3073 'url': video_url,
3074 'uploader': u'NA',
3075 'upload_date': u'NA',
3076 'title': video_title,
3077 'stitle': simple_title,
3078 'ext': u'flv',
3079 'format': u'NA',
3080 'player_url': None,
3081 })
3082 except UnavailableVideoError:
3083 self._downloader.trouble(u'\nERROR: Unable to download video')
3084
c8e30044 3085class ComedyCentralIE(InfoExtractor):
f166bccc 3086 """Information extractor for The Daily Show and Colbert Report """
c8e30044 3087
f3098c4d
PH
3088 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3089 IE_NAME = u'comedycentral'
c8e30044 3090
c8e30044
PH
3091 def report_extraction(self, episode_id):
3092 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3093
3094 def report_config_download(self, episode_id):
3095 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3096
b487ef08
PH
3097 def report_index_download(self, episode_id):
3098 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3099
fedf9f39
PH
3100 def report_player_url(self, episode_id):
3101 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3102
c8e30044
PH
3103 def _simplify_title(self, title):
3104 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3105 res = res.strip(ur'_')
3106 return res
3107
3108 def _real_extract(self, url):
3109 mobj = re.match(self._VALID_URL, url)
3110 if mobj is None:
3111 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3112 return
f166bccc
PH
3113
3114 if mobj.group('shortname'):
3115 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3116 url = 'http://www.thedailyshow.com/full-episodes/'
3117 else:
3118 url = 'http://www.colbertnation.com/full-episodes/'
3119 mobj = re.match(self._VALID_URL, url)
3120 assert mobj is not None
3121
3122 dlNewest = not mobj.group('episode')
3123 if dlNewest:
3124 epTitle = mobj.group('showname')
3125 else:
3126 epTitle = mobj.group('episode')
c8e30044
PH
3127
3128 req = urllib2.Request(url)
3129 self.report_extraction(epTitle)
3130 try:
f166bccc
PH
3131 htmlHandle = urllib2.urlopen(req)
3132 html = htmlHandle.read()
c8e30044
PH
3133 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3134 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3135 return
f166bccc
PH
3136 if dlNewest:
3137 url = htmlHandle.geturl()
3138 mobj = re.match(self._VALID_URL, url)
3139 if mobj is None:
3140 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3141 return
3142 if mobj.group('episode') == '':
3143 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3144 return
3145 epTitle = mobj.group('episode')
c8e30044 3146
b487ef08 3147 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
c8e30044
PH
3148 if len(mMovieParams) == 0:
3149 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3150 return
b487ef08
PH
3151
3152 playerUrl_raw = mMovieParams[0][0]
fedf9f39
PH
3153 self.report_player_url(epTitle)
3154 try:
b487ef08
PH
3155 urlHandle = urllib2.urlopen(playerUrl_raw)
3156 playerUrl = urlHandle.geturl()
3157 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3158 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3159 return
3160
3161 uri = mMovieParams[0][1]
3162 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3163 self.report_index_download(epTitle)
3164 try:
3165 indexXml = urllib2.urlopen(indexUrl).read()
fedf9f39 3166 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
b487ef08 3167 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
fedf9f39 3168 return
fedf9f39 3169
b487ef08
PH
3170 idoc = xml.etree.ElementTree.fromstring(indexXml)
3171 itemEls = idoc.findall('.//item')
3172 for itemEl in itemEls:
3173 mediaId = itemEl.findall('./guid')[0].text
3174 shortMediaId = mediaId.split(':')[-1]
3175 showId = mediaId.split(':')[-2].replace('.com', '')
3176 officialTitle = itemEl.findall('./title')[0].text
3177 officialDate = itemEl.findall('./pubDate')[0].text
3178
c8e30044
PH
3179 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3180 urllib.urlencode({'uri': mediaId}))
3181 configReq = urllib2.Request(configUrl)
3182 self.report_config_download(epTitle)
3183 try:
3184 configXml = urllib2.urlopen(configReq).read()
3185 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3186 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3187 return
46c8c432 3188
c8e30044
PH
3189 cdoc = xml.etree.ElementTree.fromstring(configXml)
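# The config XML is expected to contain one <rendition bitrate="..."> element per
# available quality, each with a <src> child holding the stream URL
# (structure inferred from the lookups below; no real attribute values shown)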
3190 turls = []
3191 for rendition in cdoc.findall('.//rendition'):
3192 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3193 turls.append(finfo)
3194
a88bc6bb 3195 if len(turls) == 0:
b487ef08 3196 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
a88bc6bb
PH
3197 continue
3198
c8e30044
PH
3199 # For now, just pick the highest bitrate
3200 format,video_url = turls[-1]
3201
3202 self._downloader.increment_downloads()
a88bc6bb 3203
b487ef08 3204 effTitle = showId + '-' + epTitle
c8e30044 3205 info = {
b487ef08 3206 'id': shortMediaId,
c8e30044 3207 'url': video_url,
b487ef08
PH
3208 'uploader': showId,
3209 'upload_date': officialDate,
a88bc6bb
PH
3210 'title': effTitle,
3211 'stitle': self._simplify_title(effTitle),
c8e30044
PH
3212 'ext': 'mp4',
3213 'format': format,
3214 'thumbnail': None,
b487ef08
PH
3215 'description': officialTitle,
3216 'player_url': playerUrl
c8e30044 3217 }
46c8c432 3218
c8e30044
PH
3219 try:
3220 self._downloader.process_info(info)
3221 except UnavailableVideoError, err:
b487ef08 3222 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
a88bc6bb 3223 continue
c8e30044
PH
3224
3225
f9c68787
PH
3226class EscapistIE(InfoExtractor):
3227 """Information extractor for The Escapist """
3228
b845d58b 3229 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
f3098c4d 3230 IE_NAME = u'escapist'
f9c68787 3231
f9c68787
PH
3232 def report_extraction(self, showName):
3233 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3234
3235 def report_config_download(self, showName):
3236 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3237
3238 def _simplify_title(self, title):
3239 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3240 res = res.strip(ur'_')
3241 return res
3242
3243 def _real_extract(self, url):
3244 htmlParser = HTMLParser.HTMLParser()
3245
3246 mobj = re.match(self._VALID_URL, url)
3247 if mobj is None:
3248 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3249 return
3250 showName = mobj.group('showname')
3251 videoId = mobj.group('episode')
3252
3253 self.report_extraction(showName)
3254 try:
3255 webPage = urllib2.urlopen(url).read()
3256 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3257 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3258 return
3259
3260 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3261 description = htmlParser.unescape(descMatch.group(1))
3262 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3263 imgUrl = htmlParser.unescape(imgMatch.group(1))
3264 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3265 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3266 configUrlMatch = re.search('config=(.*)$', playerUrl)
3267 configUrl = urllib2.unquote(configUrlMatch.group(1))
3268
3269 self.report_config_download(showName)
3270 try:
3271 configJSON = urllib2.urlopen(configUrl).read()
3272 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3273 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3274 return
3275
3276 # Technically, it's JavaScript, not JSON
3277 configJSON = configJSON.replace("'", '"')
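# e.g. a config like {'playlist': [{'url': 'http://...'}, ...]} becomes
# {"playlist": [{"url": "http://..."}, ...]}, which json.loads() below accepts
# (sample structure inferred from the 'playlist' lookup further down)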
3278
3279 try:
3280 config = json.loads(configJSON)
3281 except (ValueError,), err:
3282 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3283 return
3284
3285 playlist = config['playlist']
3286 videoUrl = playlist[1]['url']
3287
3288 self._downloader.increment_downloads()
3289 info = {
3290 'id': videoId,
3291 'url': videoUrl,
3292 'uploader': showName,
3293 'upload_date': None,
3294 'title': showName,
3295 'stitle': self._simplify_title(showName),
3296 'ext': 'flv',
3297 'format': 'flv',
3298 'thumbnail': imgUrl,
3299 'description': description,
3300 'player_url': playerUrl,
3301 }
3302
3303 try:
3304 self._downloader.process_info(info)
3305 except UnavailableVideoError, err:
3306 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3307
3308
8d89fbae
PH
3309class CollegeHumorIE(InfoExtractor):
3310 """Information extractor for collegehumor.com"""
3311
3312 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3313 IE_NAME = u'collegehumor'
3314
3315 def report_webpage(self, video_id):
3316 """Report webpage download."""
3317 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3318
3319 def report_extraction(self, video_id):
3320 """Report information extraction."""
3321 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3322
3323 def _simplify_title(self, title):
3324 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3325 res = res.strip(ur'_')
3326 return res
3327
3328 def _real_extract(self, url):
3329 htmlParser = HTMLParser.HTMLParser()
3330
3331 mobj = re.match(self._VALID_URL, url)
3332 if mobj is None:
3333 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3334 return
3335 video_id = mobj.group('videoid')
3336
3337 self.report_webpage(video_id)
3338 request = urllib2.Request(url)
3339 try:
3340 webpage = urllib2.urlopen(request).read()
3341 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3342 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3343 return
3344
3345 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3346 if m is None:
3347 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3348 return
3349 internal_video_id = m.group('internalvideoid')
3350
3351 info = {
3352 'id': video_id,
3353 'internal_id': internal_video_id,
3354 }
3355
3356 self.report_extraction(video_id)
3357 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3358 try:
3359 metaXml = urllib2.urlopen(xmlUrl).read()
3360 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3361 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3362 return
3363
3364 mdoc = xml.etree.ElementTree.fromstring(metaXml)
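# The metadata XML is expected to hold a <video> element with <description>,
# <caption>, <file> and <thumbnail> children (shape inferred from the lookups below)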
3365 try:
3366 videoNode = mdoc.findall('./video')[0]
3367 info['description'] = videoNode.findall('./description')[0].text
3368 info['title'] = videoNode.findall('./caption')[0].text
3369 info['stitle'] = self._simplify_title(info['title'])
3370 info['url'] = videoNode.findall('./file')[0].text
3371 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3372 info['ext'] = info['url'].rpartition('.')[2]
3373 info['format'] = info['ext']
3374 except IndexError:
3375 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3376 return
3377
3378 self._downloader.increment_downloads()
3379
3380 try:
3381 self._downloader.process_info(info)
3382 except UnavailableVideoError, err:
3383 self._downloader.trouble(u'\nERROR: unable to download video')
3384
f9c68787 3385
6501a06d
RB
3386class XVideosIE(InfoExtractor):
3387 """Information extractor for xvideos.com"""
3388
3389 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3390 IE_NAME = u'xvideos'
3391
3392 def report_webpage(self, video_id):
3393 """Report webpage download."""
3394 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3395
3396 def report_extraction(self, video_id):
3397 """Report information extraction."""
3398 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3399
3400 def _simplify_title(self, title):
3401 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3402 res = res.strip(ur'_')
3403 return res
3404
3405 def _real_extract(self, url):
3406 htmlParser = HTMLParser.HTMLParser()
3407
3408 mobj = re.match(self._VALID_URL, url)
3409 if mobj is None:
3410 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3411 return
3412 video_id = mobj.group(1).decode('utf-8')
3413
3414 self.report_webpage(video_id)
3415
3416 request = urllib2.Request(url)
3417 try:
3418 webpage = urllib2.urlopen(request).read()
3419 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3420 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3421 return
3422
3423 self.report_extraction(video_id)
3424
3425
3426 # Extract video URL
3427 mobj = re.search(r'flv_url=(.+?)&', webpage)
3428 if mobj is None:
3429 self._downloader.trouble(u'ERROR: unable to extract video url')
3430 return
3431 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3432
3433
3434 # Extract title
3435 mobj = re.search(r'<title>(.*?)</title>', webpage)
3436 if mobj is None:
3437 self._downloader.trouble(u'ERROR: unable to extract video title')
3438 return
3439 video_title = mobj.group(1).decode('utf-8')
3440
3441
3442 # Extract video thumbnail
3443 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3444 if mobj is None:
3445 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3446 return
3447 video_thumbnail = mobj.group(1).decode('utf-8')
3448
3449
3450
3451 self._downloader.increment_downloads()
3452 info = {
3453 'id': video_id,
3454 'url': video_url,
3455 'uploader': None,
3456 'upload_date': None,
3457 'title': video_title,
3458 'stitle': self._simplify_title(video_title),
3459 'ext': 'flv',
3460 'format': 'flv',
3461 'thumbnail': video_thumbnail,
3462 'description': None,
3463 'player_url': None,
3464 }
3465
3466 try:
3467 self._downloader.process_info(info)
3468 except UnavailableVideoError, err:
3469 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3470
3471
65cd34c5
RG
3472class PostProcessor(object):
3473 """Post Processor class.
3474
3475 PostProcessor objects can be added to downloaders with their
3476 add_post_processor() method. When the downloader has finished a
3477 successful download, it will take its internal chain of PostProcessors
3478 and start calling the run() method on each one of them, first with
3479 an initial argument and then with the returned value of the previous
3480 PostProcessor.
3481
3482 The chain will be stopped if one of them ever returns None or the end
3483 of the chain is reached.
3484
3485 PostProcessor objects follow a "mutual registration" process similar
3486 to InfoExtractor objects.
3487 """
3488
3489 _downloader = None
3490
3491 def __init__(self, downloader=None):
3492 self._downloader = downloader
3493
65cd34c5
RG
3494 def set_downloader(self, downloader):
3495 """Sets the downloader for this PP."""
3496 self._downloader = downloader
d3975459 3497
65cd34c5
RG
3498 def run(self, information):
3499 """Run the PostProcessor.
3500
3501 The "information" argument is a dictionary like the ones
2f11508a 3502 composed by InfoExtractors. The only difference is that this
65cd34c5
RG
3503 one has an extra field called "filepath" that points to the
3504 downloaded file.
3505
3506 When this method returns None, the postprocessing chain is
3507 stopped. However, this method may return an information
3508 dictionary that will be passed to the next postprocessing
3509 object in the chain. It can be the one it received after
3510 changing some fields.
3511
3512 In addition, this method may raise a PostProcessingError
3513 exception that will be taken into account by the downloader
3514 it was called from.
3515 """
3516 return information # by default, do nothing
d3975459 3517
c0a10ca8 3518
3072fab1
RG
3519class FFmpegExtractAudioPP(PostProcessor):
3520
c99dcbd2 3521 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
3072fab1
RG
3522 PostProcessor.__init__(self, downloader)
3523 if preferredcodec is None:
3524 preferredcodec = 'best'
3525 self._preferredcodec = preferredcodec
18b7f874 3526 self._preferredquality = preferredquality
3527 self._keepvideo = keepvideo
3072fab1
RG
3528
3529 @staticmethod
3530 def get_audio_codec(path):
da273188 3531 try:
2727dbf7
RG
3532 cmd = ['ffprobe', '-show_streams', '--', path]
3533 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
da273188
RG
3534 output = handle.communicate()[0]
3535 if handle.wait() != 0:
3536 return None
3537 except (IOError, OSError):
3072fab1
RG
3538 return None
3539 audio_codec = None
3540 for line in output.split('\n'):
3541 if line.startswith('codec_name='):
3542 audio_codec = line.split('=')[1].strip()
3543 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
3544 return audio_codec
3545 return None
3546
3547 @staticmethod
3548 def run_ffmpeg(path, out_path, codec, more_opts):
3549 try:
2727dbf7
RG
3550 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3551 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3072fab1
RG
3552 return (ret == 0)
3553 except (IOError, OSError):
3554 return False
3555
3556 def run(self, information):
3557 path = information['filepath']
3558
3559 filecodec = self.get_audio_codec(path)
3560 if filecodec is None:
da273188 3561 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
3072fab1
RG
3562 return None
3563
3564 more_opts = []
3565 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
efb113c7 3566 if filecodec in ['aac', 'mp3', 'vorbis']:
3072fab1
RG
3567 # Lossless if possible
3568 acodec = 'copy'
3569 extension = filecodec
3570 if filecodec == 'aac':
3571 more_opts = ['-f', 'adts']
58384838
RC
3572 if filecodec == 'vorbis':
3573 extension = 'ogg'
3072fab1
RG
3574 else:
3575 # MP3 otherwise.
3576 acodec = 'libmp3lame'
3577 extension = 'mp3'
c99dcbd2
PH
3578 more_opts = []
3579 if self._preferredquality is not None:
3580 more_opts += ['-ab', self._preferredquality]
3072fab1
RG
3581 else:
3582 # We convert the audio (lossy)
58384838 3583 acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
3072fab1 3584 extension = self._preferredcodec
c99dcbd2
PH
3585 more_opts = []
3586 if self._preferredquality is not None:
3587 more_opts += ['-ab', self._preferredquality]
3072fab1
RG
3588 if self._preferredcodec == 'aac':
3589 more_opts += ['-f', 'adts']
58384838
RC
3590 if self._preferredcodec == 'vorbis':
3591 extension = 'ogg'
3072fab1
RG
3592
3593 (prefix, ext) = os.path.splitext(path)
3594 new_path = prefix + '.' + extension
3595 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3596 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3597
3598 if not status:
1bd92582 3599 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
3072fab1
RG
3600 return None
3601
36597dc4
K
3602 # Try to update the date time for extracted audio file.
3603 if information.get('filetime') is not None:
3604 try:
3605 os.utime(new_path, (time.time(), information['filetime']))
3606 except:
3607 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
3608
18b7f874 3609 if not self._keepvideo:
3610 try:
3611 os.remove(path)
3612 except (IOError, OSError):
3613 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
3614 return None
3072fab1
RG
3615
3616 information['filepath'] = new_path
3617 return information
3618
5fb3df4a
GV
3619
3620def updateSelf(downloader, filename):
3621 ''' Update the program file with the latest version from the repository '''
3622 # Note: downloader only used for options
3623 if not os.access(filename, os.W_OK):
3624 sys.exit('ERROR: no write permissions on %s' % filename)
3625
d207e7cf 3626 downloader.to_screen('Updating to latest version...')
5fb3df4a 3627
4fa74b52 3628 try:
d207e7cf
PH
3629 try:
3630 urlh = urllib.urlopen(UPDATE_URL)
3631 newcontent = urlh.read()
27365956
PH
3632
3633 vmatch = re.search("__version__ = '([^']+)'", newcontent)
3634 if vmatch is not None and vmatch.group(1) == __version__:
3635 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
3636 return
d207e7cf
PH
3637 finally:
3638 urlh.close()
5fb3df4a
GV
3639 except (IOError, OSError), err:
3640 sys.exit('ERROR: unable to download latest version')
f9f1e798 3641
5fb3df4a 3642 try:
d207e7cf
PH
3643 outf = open(filename, 'wb')
3644 try:
3645 outf.write(newcontent)
3646 finally:
3647 outf.close()
5fb3df4a
GV
3648 except (IOError, OSError), err:
3649 sys.exit('ERROR: unable to overwrite current version')
4bec29ef 3650
eb6c37da 3651 downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
80066952 3652
4f9f96f6
GV
3653def parseOpts():
3654 # Deferred imports
3655 import getpass
3656 import optparse
e7cf18cb 3657
4f9f96f6
GV
3658 def _format_option_string(option):
3659 ''' ('-o', '--option') -> -o, --option METAVAR'''
80066952 3660
4f9f96f6
GV
3661 opts = []
3662
3663 if option._short_opts: opts.append(option._short_opts[0])
3664 if option._long_opts: opts.append(option._long_opts[0])
3665 if len(opts) > 1: opts.insert(1, ', ')
3666
3667 if option.takes_value(): opts.append(' %s' % option.metavar)
3668
3669 return "".join(opts)
3670
6a4f0a11
GV
3671 def _find_term_columns():
3672 columns = os.environ.get('COLUMNS', None)
2c8d32de
PH
3673 if columns:
3674 return int(columns)
3675
4f2a5e06
PH
3676 try:
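# 'stty size' prints the terminal dimensions as '<rows> <columns>', e.g. '50 132',
# so the second whitespace-separated field is the width (sample output illustrative)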
3677 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3678 out,err = sp.communicate()
eb0387a8 3679 return int(out.split()[1])
4f2a5e06
PH
3680 except:
3681 pass
2c8d32de 3682 return None
6a4f0a11 3683
51c8e53f
GV
3684 max_width = 80
3685 max_help_position = 80
3686
3687 # No need to wrap help messages if we're on a wide console
6a4f0a11 3688 columns = _find_term_columns()
51c8e53f
GV
3689 if columns: max_width = columns
3690
3691 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4f9f96f6
GV
3692 fmt.format_option_strings = _format_option_string
3693
3694 kw = {
3695 'version' : __version__,
3696 'formatter' : fmt,
a2f7e3a5 3697 'usage' : '%prog [options] url [url...]',
4f9f96f6
GV
3698 'conflict_handler' : 'resolve',
3699 }
3700
3701 parser = optparse.OptionParser(**kw)
3702
3703 # option groups
3704 general = optparse.OptionGroup(parser, 'General Options')
20e91e83 3705 selection = optparse.OptionGroup(parser, 'Video Selection')
4f9f96f6
GV
3706 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3707 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3708 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3709 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3710 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3711
3712 general.add_option('-h', '--help',
3713 action='help', help='print this help text and exit')
3714 general.add_option('-v', '--version',
3715 action='version', help='print program version and exit')
3716 general.add_option('-U', '--update',
e0e56865 3717 action='store_true', dest='update_self', help='update this program to latest version')
4f9f96f6
GV
3718 general.add_option('-i', '--ignore-errors',
3719 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3720 general.add_option('-r', '--rate-limit',
3721 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3722 general.add_option('-R', '--retries',
3723 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4f9f96f6
GV
3724 general.add_option('--dump-user-agent',
3725 action='store_true', dest='dump_user_agent',
3726 help='display the current browser identification', default=False)
f3098c4d
PH
3727 general.add_option('--list-extractors',
3728 action='store_true', dest='list_extractors',
3729 help='List all supported extractors and the URLs they would handle', default=False)
4f9f96f6 3730
20e91e83
ABP
3731 selection.add_option('--playlist-start',
3732 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3733 selection.add_option('--playlist-end',
3734 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3735 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3736 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
3737
4f9f96f6
GV
3738 authentication.add_option('-u', '--username',
3739 dest='username', metavar='USERNAME', help='account username')
3740 authentication.add_option('-p', '--password',
3741 dest='password', metavar='PASSWORD', help='account password')
3742 authentication.add_option('-n', '--netrc',
3743 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3744
3745
3746 video_format.add_option('-f', '--format',
3747 action='store', dest='format', metavar='FORMAT', help='video format code')
3748 video_format.add_option('--all-formats',
5260e68f 3749 action='store_const', dest='format', help='download all available video formats', const='all')
4f9f96f6
GV
3750 video_format.add_option('--max-quality',
3751 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2761012f
PH
3752 video_format.add_option('-F', '--list-formats',
3753 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4f9f96f6
GV
3754
3755
3756 verbosity.add_option('-q', '--quiet',
3757 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3758 verbosity.add_option('-s', '--simulate',
9b4556c4
PH
3759 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
3760 verbosity.add_option('--skip-download',
3761 action='store_true', dest='skip_download', help='do not download the video', default=False)
4f9f96f6
GV
3762 verbosity.add_option('-g', '--get-url',
3763 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3764 verbosity.add_option('-e', '--get-title',
3765 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3766 verbosity.add_option('--get-thumbnail',
3767 action='store_true', dest='getthumbnail',
3768 help='simulate, quiet but print thumbnail URL', default=False)
3769 verbosity.add_option('--get-description',
3770 action='store_true', dest='getdescription',
3771 help='simulate, quiet but print video description', default=False)
3772 verbosity.add_option('--get-filename',
3773 action='store_true', dest='getfilename',
3774 help='simulate, quiet but print output filename', default=False)
da0db53a
DH
3775 verbosity.add_option('--get-format',
3776 action='store_true', dest='getformat',
3777 help='simulate, quiet but print output format', default=False)
4f9f96f6
GV
3778 verbosity.add_option('--no-progress',
3779 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3780 verbosity.add_option('--console-title',
3781 action='store_true', dest='consoletitle',
3782 help='display progress in console titlebar', default=False)
3783
3784
3785 filesystem.add_option('-t', '--title',
3786 action='store_true', dest='usetitle', help='use title in file name', default=False)
3787 filesystem.add_option('-l', '--literal',
3788 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3789 filesystem.add_option('-A', '--auto-number',
3790 action='store_true', dest='autonumber',
3791 help='number downloaded files starting from 00000', default=False)
3792 filesystem.add_option('-o', '--output',
6bde5972 3793 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, and %% for a literal percent')
4f9f96f6
GV
3794 filesystem.add_option('-a', '--batch-file',
3795 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3796 filesystem.add_option('-w', '--no-overwrites',
3797 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3798 filesystem.add_option('-c', '--continue',
c25303c3 3799 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
18bb3d1e
PH
3800 filesystem.add_option('--no-continue',
3801 action='store_false', dest='continue_dl',
3802 help='do not resume partially downloaded files (restart from beginning)')
4f9f96f6 3803 filesystem.add_option('--cookies',
abb870d1 3804 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4f9f96f6
GV
3805 filesystem.add_option('--no-part',
3806 action='store_true', dest='nopart', help='do not use .part files', default=False)
3807 filesystem.add_option('--no-mtime',
3808 action='store_false', dest='updatetime',
3809 help='do not use the Last-modified header to set the file modification time', default=True)
2c8d32de
PH
3810 filesystem.add_option('--write-description',
3811 action='store_true', dest='writedescription',
3812 help='write video description to a .description file', default=False)
3813 filesystem.add_option('--write-info-json',
3814 action='store_true', dest='writeinfojson',
3815 help='write video metadata to a .info.json file', default=False)
4f9f96f6
GV
3816
3817
3818 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3819 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3820 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
58384838 3821 help='"best", "aac", "vorbis" or "mp3"; best by default')
c99dcbd2
PH
3822 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
3823 help='ffmpeg audio bitrate specification, 128k by default')
3824 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
3825 help='keeps the video file on disk after the post-processing; the video is erased by default')
4f9f96f6
GV
3826
3827
3828 parser.add_option_group(general)
20e91e83 3829 parser.add_option_group(selection)
4f9f96f6
GV
3830 parser.add_option_group(filesystem)
3831 parser.add_option_group(verbosity)
3832 parser.add_option_group(video_format)
3833 parser.add_option_group(authentication)
3834 parser.add_option_group(postproc)
3835
3836 opts, args = parser.parse_args()
3837
3838 return parser, opts, args
3839
f3098c4d
PH
3840def gen_extractors():
3841 """ Return a list of an instance of every supported extractor.
3842 The order does matter; the first extractor matched is the one handling the URL.
3843 """
3844 youtube_ie = YoutubeIE()
3845 google_ie = GoogleIE()
3846 yahoo_ie = YahooIE()
3847 return [
f3098c4d
PH
3848 YoutubePlaylistIE(youtube_ie),
3849 YoutubeUserIE(youtube_ie),
3850 YoutubeSearchIE(youtube_ie),
1cde6f1d
PH
3851 youtube_ie,
3852 MetacafeIE(youtube_ie),
3853 DailymotionIE(),
f3098c4d
PH
3854 google_ie,
3855 GoogleSearchIE(google_ie),
3856 PhotobucketIE(),
3857 yahoo_ie,
3858 YahooSearchIE(yahoo_ie),
3859 DepositFilesIE(),
3860 FacebookIE(),
3861 BlipTVIE(),
3862 VimeoIE(),
3863 MyVideoIE(),
3864 ComedyCentralIE(),
3865 EscapistIE(),
8d89fbae 3866 CollegeHumorIE(),
6501a06d 3867 XVideosIE(),
f3098c4d
PH
3868
3869 GenericIE()
3870 ]
3871
5adcaa43
GV
3872def main():
3873 parser, opts, args = parseOpts()
4f9f96f6 3874
5adcaa43
GV
3875 # Open appropriate CookieJar
3876 if opts.cookiefile is None:
3877 jar = cookielib.CookieJar()
3878 else:
8cc44341 3879 try:
5adcaa43
GV
3880 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3881 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3882 jar.load()
3883 except (IOError, OSError), err:
3884 sys.exit(u'ERROR: unable to open cookie file')
80066952 3885
5adcaa43
GV
3886 # Dump user agent
3887 if opts.dump_user_agent:
3888 print std_headers['User-Agent']
3889 sys.exit(0)
e7cf18cb 3890
5adcaa43
GV
3891 # Batch file verification
3892 batchurls = []
3893 if opts.batchfile is not None:
8cc44341 3894 try:
5adcaa43
GV
3895 if opts.batchfile == '-':
3896 batchfd = sys.stdin
4bec29ef 3897 else:
5adcaa43
GV
3898 batchfd = open(opts.batchfile, 'r')
3899 batchurls = batchfd.readlines()
3900 batchurls = [x.strip() for x in batchurls]
3901 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
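# The filtering above means a batch file is simply one URL per line, with blank
# lines and lines starting with '#', '/' or ';' skipped as comments, e.g.:
#   # queued downloads (contents illustrative)
#   http://www.youtube.com/watch?v=XXXXXXXXXXX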
3902 except IOError:
3903 sys.exit(u'ERROR: batch file could not be read')
3904 all_urls = batchurls + args
3905
f3098c4d
PH
3906 # General configuration
3907 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3908 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3909 urllib2.install_opener(opener)
3910 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3911
3912 extractors = gen_extractors()
3913
3914 if opts.list_extractors:
3915 for ie in extractors:
3916 print(ie.IE_NAME)
3917 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
3918 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
3919 for mu in matchedUrls:
3920 print(u' ' + mu)
3921 sys.exit(0)
3922
5adcaa43
GV
3923 # Conflicting, missing and erroneous options
3924 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3925 parser.error(u'using .netrc conflicts with giving username/password')
3926 if opts.password is not None and opts.username is None:
3927 parser.error(u'account username missing')
3928 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3929 parser.error(u'using output template conflicts with using title, literal title or auto number')
3930 if opts.usetitle and opts.useliteral:
3931 parser.error(u'using title conflicts with using literal title')
3932 if opts.username is not None and opts.password is None:
3933 opts.password = getpass.getpass(u'Type account password and press return:')
3934 if opts.ratelimit is not None:
3935 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3936 if numeric_limit is None:
3937 parser.error(u'invalid rate limit specified')
3938 opts.ratelimit = numeric_limit
3939 if opts.retries is not None:
8cc44341 3940 try:
5adcaa43 3941 opts.retries = long(opts.retries)
8cc44341 3942 except (TypeError, ValueError), err:
5adcaa43
GV
3943 parser.error(u'invalid retry count specified')
3944 try:
2c8d32de 3945 opts.playliststart = int(opts.playliststart)
5adcaa43 3946 if opts.playliststart <= 0:
2c8d32de 3947 raise ValueError(u'Playlist start must be positive')
5adcaa43
GV
3948 except (TypeError, ValueError), err:
3949 parser.error(u'invalid playlist start number specified')
3950 try:
2c8d32de 3951 opts.playlistend = int(opts.playlistend)
5adcaa43 3952 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2c8d32de 3953 raise ValueError(u'Playlist end must be greater than playlist start')
5adcaa43
GV
3954 except (TypeError, ValueError), err:
3955 parser.error(u'invalid playlist end number specified')
3956 if opts.extractaudio:
58384838 3957 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
5adcaa43
GV
3958 parser.error(u'invalid audio format specified')
3959
5adcaa43
GV
3960 # File downloader
3961 fd = FileDownloader({
3962 'usenetrc': opts.usenetrc,
3963 'username': opts.username,
3964 'password': opts.password,
da0db53a 3965 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
5adcaa43
GV
3966 'forceurl': opts.geturl,
3967 'forcetitle': opts.gettitle,
3968 'forcethumbnail': opts.getthumbnail,
3969 'forcedescription': opts.getdescription,
3970 'forcefilename': opts.getfilename,
da0db53a 3971 'forceformat': opts.getformat,
9b4556c4 3972 'simulate': opts.simulate,
da0db53a 3973 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
5adcaa43
GV
3974 'format': opts.format,
3975 'format_limit': opts.format_limit,
3de2a1e6 3976 'listformats': opts.listformats,
5adcaa43
GV
3977 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3978 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3979 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3980 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3981 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3982 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3983 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3984 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3985 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3986 or u'%(id)s.%(ext)s'),
3987 'ignoreerrors': opts.ignoreerrors,
3988 'ratelimit': opts.ratelimit,
3989 'nooverwrites': opts.nooverwrites,
3990 'retries': opts.retries,
3991 'continuedl': opts.continue_dl,
3992 'noprogress': opts.noprogress,
3993 'playliststart': opts.playliststart,
3994 'playlistend': opts.playlistend,
3995 'logtostderr': opts.outtmpl == '-',
3996 'consoletitle': opts.consoletitle,
3997 'nopart': opts.nopart,
3998 'updatetime': opts.updatetime,
2c8d32de
PH
3999 'writedescription': opts.writedescription,
4000 'writeinfojson': opts.writeinfojson,
20e91e83
ABP
4001 'matchtitle': opts.matchtitle,
4002 'rejecttitle': opts.rejecttitle,
5adcaa43 4003 })
8c5dc3ad
PH
4004 for extractor in extractors:
4005 fd.add_info_extractor(extractor)
5adcaa43
GV
4006
4007 # PostProcessors
4008 if opts.extractaudio:
c99dcbd2 4009 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
5adcaa43
GV
4010
4011 # Update version
4012 if opts.update_self:
4013 updateSelf(fd, sys.argv[0])
4014
4015 # Maybe do nothing
4016 if len(all_urls) < 1:
4017 if not opts.update_self:
4018 parser.error(u'you must provide at least one URL')
4019 else:
4020 sys.exit()
4021 retcode = fd.download(all_urls)
80066952 4022
5adcaa43
GV
4023 # Dump cookie jar if requested
4024 if opts.cookiefile is not None:
4025 try:
4026 jar.save()
4027 except (IOError, OSError), err:
4028 sys.exit(u'ERROR: unable to save cookie jar')
80066952 4029
5adcaa43 4030 sys.exit(retcode)
80066952 4031
4fa74b52 4032
5adcaa43
GV
4033if __name__ == '__main__':
4034 try:
4035 main()
e5bf0f55
RG
4036 except DownloadError:
4037 sys.exit(1)
4038 except SameFileError:
76a7f364 4039 sys.exit(u'ERROR: fixed output name but more than one file to download')
4fa74b52 4040 except KeyboardInterrupt:
76a7f364 4041 sys.exit(u'\nERROR: Interrupted by user')
e9cb9c28
GV
4042
4043# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: