#!/usr/bin/env python
# -*- coding: utf-8 -*-

__author__ = (
	'Ricardo Garcia Gonzalez',
	'Danny Colligan',
	'Benjamin Johnson',
	'Vasyl\' Vavrychuk',
	'Witold Baryluk',
	'Paweł Paprota',
	'Gergely Imreh',
	'Rogério Brito',
	'Philipp Hagemeister',
	'Sören Schulze',
	'Kevin Ngo',
	'Ori Avtalion',
	)

__license__ = 'Public Domain'
__version__ = '2011.10.19'

UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'

import cookielib
import datetime
import gzip
import htmlentitydefs
import HTMLParser
import httplib
import locale
import math
import netrc
import os
import os.path
import re
import socket
import string
import subprocess
import sys
import time
import urllib
import urllib2
import warnings
import zlib

if os.name == 'nt':
	import ctypes

try:
	import email.utils
except ImportError: # Python 2.4
	import email.Utils
try:
	import cStringIO as StringIO
except ImportError:
	import StringIO

# parse_qs was moved from the cgi module to the urlparse module recently.
try:
	from urlparse import parse_qs
except ImportError:
	from cgi import parse_qs

try:
	import lxml.etree
except ImportError:
	pass # Handled below

try:
	import xml.etree.ElementTree
except ImportError: # Python<2.5: Not officially supported, but let it slip
	warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')

std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}

try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	class json(object):
		@staticmethod
		def loads(s):
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				i += 1
				e = i
				while True:
					e = s.index('"', e)
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res

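# Illustrative usage of the json interface defined above (the stdlib module and
# the trivialjson fallback behave the same for this call); example only, not
# part of the original script:
#
#   >>> json.loads('{"title": "v\\u00eddeo", "ids": [1, 2]}')['ids']
#   [1, 2]
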
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	def yield_preferredencoding():
		try:
			pref = locale.getpreferredencoding()
			u'TEST'.encode(pref)
		except:
			pref = 'UTF-8'
		while True:
			yield pref
	return yield_preferredencoding().next()


def htmlentity_transform(matchobj):
	"""Transforms an HTML entity to a Unicode character.

	This function receives a match object and is intended to be used with
	the re.sub() function.
	"""
	entity = matchobj.group(1)

	# Known non-numeric HTML entity
	if entity in htmlentitydefs.name2codepoint:
		return unichr(htmlentitydefs.name2codepoint[entity])

	# Unicode character
	mobj = re.match(ur'(?u)#(x?\d+)', entity)
	if mobj is not None:
		numstr = mobj.group(1)
		if numstr.startswith(u'x'):
			base = 16
			numstr = u'0%s' % numstr
		else:
			base = 10
		return unichr(long(numstr, base))

	# Unknown entity in name, return its literal representation
	return (u'&%s;' % entity)


def sanitize_title(utitle):
	"""Sanitizes a video title so it could be used as part of a filename."""
	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
	return utitle.replace(unicode(os.sep), u'%')


def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			if sys.platform == 'win32':
				import msvcrt
				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)


def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	timestamp = None
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is not None:
		timestamp = email.utils.mktime_tz(timetuple)
	return timestamp

def _simplify_title(title):
	return re.sub(ur'[^\w\d_\-]+', u'_', title).strip(u'_')

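# Doctest-style illustrations for the helpers above (examples only, not part
# of the original script; the sanitize_title output assumes a POSIX os.sep of
# '/'):
#
#   >>> sanitize_title(u'A &amp; B / C')
#   u'A & B % C'
#   >>> _simplify_title(u'A & B / C!')
#   u'A_B_C'
#   >>> timeconvert('Wed, 19 Oct 2011 00:00:00 +0000') is not None
#   True
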
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
	pass


class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
	pass


class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	pass


class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	pass


class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""
	# Both in bytes
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected


class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	  http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		old_resp = resp
		# gzip
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp


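# Illustrative sketch (not part of the original script) of how the handler
# above is meant to be used: install it in an OpenerDirector and set the
# "Youtubedl-No-Compression" header on any request that must skip gzip/deflate.
#
#   opener = urllib2.build_opener(YoutubeDLHandler(), urllib2.HTTPCookieProcessor())
#   urllib2.install_opener(opener)
#   req = urllib2.Request('http://example.com/video', None,
#                         {'Youtubedl-no-compression': 'True'})
#   page = urllib2.urlopen(req).read()
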
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible for downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. Given a video URL, the downloader doesn't know how to
	extract all the needed information (that is the InfoExtractors' task),
	so it has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader hands it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge of the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:          Username for authentication purposes.
	password:          Password for authentication purposes.
	usenetrc:          Use netrc for authentication instead.
	quiet:             Do not print messages to stdout.
	forceurl:          Force printing final URL.
	forcetitle:        Force printing title.
	forcethumbnail:    Force printing thumbnail URL.
	forcedescription:  Force printing description.
	forcefilename:     Force printing final filename.
	simulate:          Do not download the video files.
	format:            Video format code.
	format_limit:      Highest quality format to try.
	outtmpl:           Template for output names.
	ignoreerrors:      Do not stop on download errors.
	ratelimit:         Download speed limit, in bytes/sec.
	nooverwrites:      Prevent overwriting files.
	retries:           Number of times to retry for HTTP error 5xx.
	continuedl:        Try to continue downloads if possible.
	noprogress:        Do not print the progress bar.
	playliststart:     Playlist item to start at.
	playlistend:       Playlist item to end at.
	matchtitle:        Download only matching titles.
	rejecttitle:       Reject downloads for matching titles.
	logtostderr:       Log messages to stderr instead of stdout.
	consoletitle:      Display progress in console window's titlebar.
	nopart:            Do not use temporary .part files.
	updatetime:        Use the Last-modified header to set output file timestamps.
	writedescription:  Write the video description to a .description file.
	writeinfojson:     Write the video metadata to a .info.json file.
	"""

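	# Illustrative construction sketch (not part of the original script); the
	# keys come from the "Available options" list above and the URL/video id
	# are made-up placeholders:
	#
	#   fd = FileDownloader({
	#       'outtmpl': u'%(stitle)s-%(id)s.%(ext)s',
	#       'continuedl': True,
	#       'ratelimit': FileDownloader.parse_bytes('500k'),
	#   })
	#   fd.add_info_extractor(YoutubeIE())
	#   fd.download(['http://www.youtube.com/watch?v=EXAMPLE'])
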
	params = None
	_ies = []
	_pps = []
	_download_retcode = None
	_num_downloads = None
	_screen_file = None

	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self._num_downloads = 0
		self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
		self.params = params

	@staticmethod
	def format_bytes(bytes):
		if bytes is None:
			return 'N/A'
		if type(bytes) is str:
			bytes = float(bytes)
		if bytes == 0.0:
			exponent = 0
		else:
			exponent = long(math.log(bytes, 1024.0))
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024 ** exponent)
		return '%.2f%s' % (converted, suffix)

	@staticmethod
	def calc_percent(byte_counter, data_len):
		if data_len is None:
			return '---.-%'
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

	@staticmethod
	def calc_eta(start, now, total, current):
		if total is None:
			return '--:--'
		dif = now - start
		if current == 0 or dif < 0.001: # One millisecond
			return '--:--'
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		if eta_mins > 99:
			return '--:--'
		return '%02d:%02d' % (eta_mins, eta_secs)

	@staticmethod
	def calc_speed(start, now, bytes):
		dif = now - start
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

	@staticmethod
	def best_block_size(elapsed_time, bytes):
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			return long(new_max)
		rate = bytes / elapsed_time
		if rate > new_max:
			return long(new_max)
		if rate < new_min:
			return long(new_min)
		return long(rate)

	@staticmethod
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		if matchobj is None:
			return None
		number = float(matchobj.group(1))
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))

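	# Doctest-style illustrations for the byte helpers above (examples only,
	# not part of the original script):
	#
	#   >>> FileDownloader.format_bytes(1536)
	#   '1.50k'
	#   >>> FileDownloader.parse_bytes('1.5k')
	#   1536L
	#   >>> FileDownloader.parse_bytes('50K') == 50 * 1024
	#   True
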
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		pp.set_downloader(self)

	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				terminator = [u'\n', u''][skip_eol]
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
				self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())

	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))

	def fixed_template(self):
		"""Checks if the output template is fixed."""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		self._download_retcode = 1

	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			return
		now = time.time()
		elapsed = now - start_time
		if elapsed <= 0.0:
			return
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

	def temp_name(self, filename):
		"""Returns a temporary filename for the given filename."""
		if self.params.get('nopart', False) or filename == u'-' or \
				(os.path.exists(filename) and not os.path.isfile(filename)):
			return filename
		return filename + u'.part'

	def undo_temp_name(self, filename):
		if filename.endswith(u'.part'):
			return filename[:-len(u'.part')]
		return filename

	def try_rename(self, old_filename, new_filename):
		try:
			if old_filename == new_filename:
				return
			os.rename(old_filename, new_filename)
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')

	def try_utime(self, filename, last_modified_hdr):
		"""Try to set the last-modified time of the given file."""
		if last_modified_hdr is None:
			return
		if not os.path.isfile(filename):
			return
		timestr = last_modified_hdr
		if timestr is None:
			return
		filetime = timeconvert(timestr)
		if filetime is None:
			return filetime
		try:
			os.utime(filename, (time.time(), filetime))
		except:
			pass
		return filetime

	def report_writedescription(self, descfn):
		""" Report that the description file is being written """
		self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)

	def report_writeinfojson(self, infofn):
		""" Report that the metadata file has been written """
		self.to_screen(u'[info] Writing video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)

	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
		self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
				(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx"""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			self.to_screen(u'[download] The file has already been downloaded')

	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_screen(u'[download] Unable to resume')

	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_screen(u'[download] Download completed')
		else:
			self.to_screen(u'')

	def increment_downloads(self):
		"""Increment the ordinal that assigns a number to each file."""
		self._num_downloads += 1

	def prepare_filename(self, info_dict):
		"""Generate the output filename."""
		try:
			template_dict = dict(info_dict)
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
			return filename
		except (ValueError, KeyError), err:
			self.trouble(u'ERROR: invalid system charset or erroneous output template')
			return None

	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		filename = self.prepare_filename(info_dict)

		# Forced printings
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceformat', False):
			print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			return

		if filename is None:
			return

		matchtitle = self.params.get('matchtitle', False)
		rejecttitle = self.params.get('rejecttitle', False)
		title = info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
			return
		if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
			return

		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		try:
			dn = os.path.dirname(filename)
			if dn != '' and not os.path.exists(dn):
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			try:
				json.dump
			except (NameError, AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					json_info_dict = dict((k, v) for k, v in info_dict.iteritems() if not k in ('urlhandle',))
					json.dump(json_info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		if not self.params.get('skip_download', False):
			try:
				success = self._do_download(filename, info_dict)
			except (OSError, IOError), err:
				raise UnavailableVideoError
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self.trouble(u'ERROR: unable to download video data: %s' % str(err))
				return
			except (ContentTooShortError, ), err:
				self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
				return

			if success:
				try:
					self.post_process(filename, info_dict)
				except (PostProcessingError), err:
					self.trouble(u'ERROR: postprocessing: %s' % str(err))
					return

	def download(self, url_list):
		"""Download a given list of URLs."""
		if len(url_list) > 1 and self.fixed_template():
			raise SameFileError(self.params['outtmpl'])

		for url in url_list:
			suitable_found = False
			for ie in self._ies:
				# Go to next InfoExtractor if not suitable
				if not ie.suitable(url):
					continue

				# Suitable InfoExtractor found
				suitable_found = True

				# Extract information from URL and process it
				ie.extract(url)

				# Suitable InfoExtractor had been found; go to next URL
				break

			if not suitable_found:
				self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode

	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file."""
		info = dict(ie_info)
		info['filepath'] = filename
		for pp in self._pps:
			info = pp.run(info)
			if info is None:
				break

	def _download_with_rtmpdump(self, filename, url, player_url):
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			if prevsize == cursize and retval == 1:
				break
			# Some rtmp streams seem to abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False

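	# For reference (illustrative, not part of the original script), the
	# rtmpdump command line assembled above looks roughly like:
	#
	#   rtmpdump -q [-W <player_url>] -r <url> -o <filename>.part [-e [-k 1]]
	#
	# where -e asks rtmpdump to resume a partial download and, as noted in the
	# comment above, exit code 2 signals an interrupted but resumable transfer.
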
	def _do_download(self, filename, info_dict):
		url = info_dict['url']
		player_url = info_dict.get('player_url', None)

		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		open_mode = 'wb'
		if resume_len != 0:
			if self.params.get('continuedl', False):
				self.report_resuming_byte(resume_len)
				request.add_header('Range', 'bytes=%d-' % resume_len)
				open_mode = 'ab'
			else:
				resume_len = 0

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				if count == 0 and 'urlhandle' in info_dict:
					data = info_dict['urlhandle']
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
								(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					assert stream is not None
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			if data_len is None:
				self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
			else:
				percent_str = self.calc_percent(byte_counter, data_len)
				eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
				self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		if stream is None:
			self.trouble(u'\nERROR: Did not get any data blocks')
			return False
		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

		return True


class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information, possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id:          Video identifier.
	url:         Final video URL.
	uploader:    Nickname of the video uploader.
	title:       Literal title.
	stitle:      Simplified title.
	ext:         Video filename extension.
	format:      Video format.
	player_url:  SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:

	thumbnail:   Full URL to a video thumbnail image.
	description: One-line video description.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods and define a _VALID_URL regexp.
	Probably, they should also be added to the list of extractors.
	"""

	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		if not self._ready:
			self._real_initialize()
			self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass

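# Illustrative sketch (not part of the original script) of the dictionary an
# InfoExtractor passes to FileDownloader.process_info(); the field names follow
# the docstring above, the values are made up:
#
#   self._downloader.process_info({
#       'id': u'abc123',
#       'url': u'http://example.com/video.mp4',
#       'uploader': u'someuser',
#       'title': u'Some title',
#       'stitle': u'Some_title',
#       'ext': u'mp4',
#       'format': u'NA',
#       'player_url': None,
#   })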

class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
	}
	_video_dimensions = {
		'5': '240x400',
		'6': '???',
		'13': '???',
		'17': '144x176',
		'18': '360x640',
		'22': '720x1280',
		'34': '360x640',
		'35': '480x854',
		'37': '1080x1920',
		'38': '3072x4096',
		'43': '360x640',
		'44': '480x854',
		'45': '720x1280',
	}
	IE_NAME = u'youtube'

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_screen(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_screen(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report extracted video URL."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_screen(u'[youtube] RTMP download detected')

	def _print_formats(self, formats):
		print 'Available formats:'
		for x in formats:
			print '%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

	def _real_initialize(self):
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next': '/',
				'action_login': 'Log In',
				'username': username,
				'password': password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url': '/',
				'action_confirm': 'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title
		simple_title = _simplify_title(video_title)

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else: # don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					pass

		# description
		try:
			lxml.etree
		except NameError:
			video_description = u'No description available.'
			if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
				mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
				if mobj is not None:
					video_description = mobj.group(1).decode('utf-8')
		else:
			html_parser = lxml.etree.HTMLParser(encoding='utf-8')
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
			# TODO use another parser

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

			format_limit = self._downloader.params.get('format_limit', None)
			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
				return
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific formats. We pick the first in a slash-delimited sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
					if rf in url_map:
						video_url_list = [(rf, url_map[rf])]
						break
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
		else:
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
			return

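		# Example (illustrative, not part of the original script): with
		# params['format'] == '22/18' and only itags '18' and '34' present in
		# url_map, the loop above picks '18', the first requested format that
		# is actually available.
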
497cd3e6
RG
1379 for format_param, video_real_url in video_url_list:
1380 # At this point we have a new video
1381 self._downloader.increment_downloads()
1382
1383 # Extension
1384 video_extension = self._video_extensions.get(format_param, 'flv')
7e58d568 1385
968aa884 1386 try:
7b7759f5 1387 # Process video information
1388 self._downloader.process_info({
1389 'id': video_id.decode('utf-8'),
1390 'url': video_real_url.decode('utf-8'),
1391 'uploader': video_uploader.decode('utf-8'),
138b11f3 1392 'upload_date': upload_date,
7b7759f5 1393 'title': video_title,
1394 'stitle': simple_title,
1395 'ext': video_extension.decode('utf-8'),
6ba562b0 1396 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
7e58d568 1397 'thumbnail': video_thumbnail.decode('utf-8'),
c6b55a8d 1398 'description': video_description,
e616ec0c 1399 'player_url': player_url,
7b7759f5 1400 })
497cd3e6 1401 except UnavailableVideoError, err:
09cc744c 1402 self._downloader.trouble(u'\nERROR: unable to download video')
42bcd27d 1403
4fa74b52 1404
020f7150
RG
1405class MetacafeIE(InfoExtractor):
1406 """Information Extractor for metacafe.com."""
1407
1408 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
2546e767 1409 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
dbccb6cd 1410 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
020f7150 1411 _youtube_ie = None
f3098c4d 1412 IE_NAME = u'metacafe'
020f7150
RG
1413
1414 def __init__(self, youtube_ie, downloader=None):
1415 InfoExtractor.__init__(self, downloader)
1416 self._youtube_ie = youtube_ie
1417
020f7150
RG
1418 def report_disclaimer(self):
1419 """Report disclaimer retrieval."""
331ce0a0 1420 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
020f7150
RG
1421
1422 def report_age_confirmation(self):
1423 """Report attempt to confirm age."""
331ce0a0 1424 self._downloader.to_screen(u'[metacafe] Confirming age')
d3975459 1425
020f7150
RG
1426 def report_download_webpage(self, video_id):
1427 """Report webpage download."""
331ce0a0 1428 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
d3975459 1429
020f7150
RG
1430 def report_extraction(self, video_id):
1431 """Report information extraction."""
331ce0a0 1432 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
020f7150
RG
1433
1434 def _real_initialize(self):
1435 # Retrieve disclaimer
1987c232 1436 request = urllib2.Request(self._DISCLAIMER)
020f7150
RG
1437 try:
1438 self.report_disclaimer()
1439 disclaimer = urllib2.urlopen(request).read()
1440 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 1441 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
020f7150
RG
1442 return
1443
1444 # Confirm age
1445 disclaimer_form = {
2546e767 1446 'filters': '0',
020f7150
RG
1447 'submit': "Continue - I'm over 18",
1448 }
1987c232 1449 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
020f7150
RG
1450 try:
1451 self.report_age_confirmation()
1452 disclaimer = urllib2.urlopen(request).read()
1453 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 1454 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
020f7150 1455 return
d3975459 1456
020f7150
RG
1457 def _real_extract(self, url):
1458 # Extract id and simplified title from URL
1459 mobj = re.match(self._VALID_URL, url)
1460 if mobj is None:
147753eb 1461 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
6f21f686 1462 return
020f7150
RG
1463
1464 video_id = mobj.group(1)
1465
1466 # Check if video comes from YouTube
1467 mobj2 = re.match(r'^yt-(.*)$', video_id)
1468 if mobj2 is not None:
6f21f686
RG
1469 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1470 return
020f7150 1471
df372a65 1472 # At this point we have a new video
9bf7fa52 1473 self._downloader.increment_downloads()
df372a65 1474
020f7150 1475 simple_title = mobj.group(2).decode('utf-8')
020f7150
RG
1476
1477 # Retrieve video webpage to extract further information
1478 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1479 try:
1480 self.report_download_webpage(video_id)
1481 webpage = urllib2.urlopen(request).read()
1482 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 1483 self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
6f21f686 1484 return
020f7150
RG
1485
1486 # Extract URL, uploader and title from webpage
1487 self.report_extraction(video_id)
18963a36 1488 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
c6c555cf
RG
1489 if mobj is not None:
1490 mediaURL = urllib.unquote(mobj.group(1))
6b57e8c5 1491 video_extension = mediaURL[-3:]
d3975459 1492
c6c555cf
RG
1493 # Extract gdaKey if available
1494 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1495 if mobj is None:
1496 video_url = mediaURL
1497 else:
1498 gdaKey = mobj.group(1)
1499 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
109626fc 1500 else:
c6c555cf
RG
1501 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1502 if mobj is None:
1503 self._downloader.trouble(u'ERROR: unable to extract media URL')
1504 return
1505 vardict = parse_qs(mobj.group(1))
1506 if 'mediaData' not in vardict:
1507 self._downloader.trouble(u'ERROR: unable to extract media URL')
1508 return
1509 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1510 if mobj is None:
1511 self._downloader.trouble(u'ERROR: unable to extract media URL')
1512 return
6b57e8c5
RG
1513 mediaURL = mobj.group(1).replace('\\/', '/')
1514 video_extension = mediaURL[-3:]
1515 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
020f7150 1516
2546e767 1517 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
020f7150 1518 if mobj is None:
147753eb 1519 self._downloader.trouble(u'ERROR: unable to extract title')
6f21f686 1520 return
020f7150 1521 video_title = mobj.group(1).decode('utf-8')
490fd7ae 1522 video_title = sanitize_title(video_title)
020f7150 1523
29f07568 1524 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
020f7150 1525 if mobj is None:
147753eb 1526 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
6f21f686 1527 return
dbccb6cd 1528 video_uploader = mobj.group(1)
020f7150 1529
42bcd27d 1530 try:
1531 # Process video information
1532 self._downloader.process_info({
1533 'id': video_id.decode('utf-8'),
1534 'url': video_url.decode('utf-8'),
1535 'uploader': video_uploader.decode('utf-8'),
138b11f3 1536 'upload_date': u'NA',
42bcd27d 1537 'title': video_title,
1538 'stitle': simple_title,
1539 'ext': video_extension.decode('utf-8'),
6ba562b0 1540 'format': u'NA',
e616ec0c 1541 'player_url': None,
42bcd27d 1542 })
73f4e7af 1543 except UnavailableVideoError:
09cc744c 1544 self._downloader.trouble(u'\nERROR: unable to download video')
020f7150 1545
25af2bce 1546
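# A minimal sketch of how the Metacafe media URL is assembled above: when a
# gdaKey is found it is appended as the __gda__ query parameter. The inputs
# below are hypothetical.
def _example_metacafe_url(mediaURL, gdaKey=None):
    if gdaKey is None:
        return mediaURL
    return '%s?__gda__=%s' % (mediaURL, gdaKey)

# For example, _example_metacafe_url('http://v.example.com/clip.flv', 'abc123')
# returns 'http://v.example.com/clip.flv?__gda__=abc123'.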
4135fa45
WB
1547 class DailymotionIE(InfoExtractor):
1548 """Information Extractor for Dailymotion"""
1549
1550 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
f3098c4d 1551 IE_NAME = u'dailymotion'
4135fa45
WB
1552
1553 def __init__(self, downloader=None):
1554 InfoExtractor.__init__(self, downloader)
1555
4135fa45
WB
1556 def report_download_webpage(self, video_id):
1557 """Report webpage download."""
331ce0a0 1558 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
d3975459 1559
4135fa45
WB
1560 def report_extraction(self, video_id):
1561 """Report information extraction."""
331ce0a0 1562 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
4135fa45 1563
4135fa45
WB
1564 def _real_extract(self, url):
1565 # Extract id and simplified title from URL
1566 mobj = re.match(self._VALID_URL, url)
1567 if mobj is None:
1568 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1569 return
1570
df372a65 1571 # At this point we have a new video
9bf7fa52 1572 self._downloader.increment_downloads()
4135fa45
WB
1573 video_id = mobj.group(1)
1574
1575 simple_title = mobj.group(2).decode('utf-8')
1576 video_extension = 'flv'
1577
1578 # Retrieve video webpage to extract further information
1579 request = urllib2.Request(url)
62a29bbf 1580 request.add_header('Cookie', 'family_filter=off')
4135fa45
WB
1581 try:
1582 self.report_download_webpage(video_id)
1583 webpage = urllib2.urlopen(request).read()
1584 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1585 self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
1586 return
1587
1588 # Extract URL, uploader and title from webpage
1589 self.report_extraction(video_id)
62a29bbf 1590 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
4135fa45
WB
1591 if mobj is None:
1592 self._downloader.trouble(u'ERROR: unable to extract media URL')
1593 return
62a29bbf 1594 sequence = urllib.unquote(mobj.group(1))
1595 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1596 if mobj is None:
1597 self._downloader.trouble(u'ERROR: unable to extract media URL')
1598 return
1599 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
4135fa45
WB
1600
1601 # If needed, prepend http://www.dailymotion.com/ to relative media URLs
1602
1603 video_url = mediaURL
1604
62a29bbf 1605 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
4135fa45
WB
1606 if mobj is None:
1607 self._downloader.trouble(u'ERROR: unable to extract title')
1608 return
1609 video_title = mobj.group(1).decode('utf-8')
1610 video_title = sanitize_title(video_title)
1611
62a29bbf 1612 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
4135fa45
WB
1613 if mobj is None:
1614 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1615 return
1616 video_uploader = mobj.group(1)
1617
1618 try:
1619 # Process video information
1620 self._downloader.process_info({
1621 'id': video_id.decode('utf-8'),
1622 'url': video_url.decode('utf-8'),
1623 'uploader': video_uploader.decode('utf-8'),
138b11f3 1624 'upload_date': u'NA',
4135fa45
WB
1625 'title': video_title,
1626 'stitle': simple_title,
1627 'ext': video_extension.decode('utf-8'),
1628 'format': u'NA',
1629 'player_url': None,
1630 })
73f4e7af 1631 except UnavailableVideoError:
09cc744c 1632 self._downloader.trouble(u'\nERROR: unable to download video')
4135fa45 1633
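# A minimal sketch of the Dailymotion "sequence" parsing above, applied to a
# hypothetical flashvars fragment. It mirrors the sdURL regex and the
# backslash stripping used by the extractor.
def _example_dailymotion_sd_url(sequence):
    mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
    if mobj is None:
        return None
    return urllib.unquote(mobj.group(1)).replace('\\', '')

# A captured value like 'http:\/\/proxy.example.com\/video.mp4' becomes
# 'http://proxy.example.com/video.mp4'.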
c0a10ca8 1634
49c0028a 1635 class GoogleIE(InfoExtractor):
1636 """Information extractor for video.google.com."""
1637
490fd7ae 1638 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
f3098c4d 1639 IE_NAME = u'video.google'
49c0028a 1640
1641 def __init__(self, downloader=None):
1642 InfoExtractor.__init__(self, downloader)
1643
49c0028a 1644 def report_download_webpage(self, video_id):
1645 """Report webpage download."""
331ce0a0 1646 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
49c0028a 1647
1648 def report_extraction(self, video_id):
1649 """Report information extraction."""
331ce0a0 1650 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
49c0028a 1651
49c0028a 1652 def _real_extract(self, url):
1653 # Extract id from URL
1654 mobj = re.match(self._VALID_URL, url)
1655 if mobj is None:
1656 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1657 return
1658
df372a65 1659 # At this point we have a new video
9bf7fa52 1660 self._downloader.increment_downloads()
49c0028a 1661 video_id = mobj.group(1)
1662
1663 video_extension = 'mp4'
1664
1665 # Retrieve video webpage to extract further information
490fd7ae 1666 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
49c0028a 1667 try:
1668 self.report_download_webpage(video_id)
1669 webpage = urllib2.urlopen(request).read()
1670 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1671 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1672 return
1673
1674 # Extract URL, uploader, and title from webpage
1675 self.report_extraction(video_id)
490fd7ae
RG
1676 mobj = re.search(r"download_url:'([^']+)'", webpage)
1677 if mobj is None:
1678 video_extension = 'flv'
1679 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
49c0028a 1680 if mobj is None:
1681 self._downloader.trouble(u'ERROR: unable to extract media URL')
1682 return
1683 mediaURL = urllib.unquote(mobj.group(1))
1684 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1685 mediaURL = mediaURL.replace('\\x26', '\x26')
1686
1687 video_url = mediaURL
1688
1689 mobj = re.search(r'<title>(.*)</title>', webpage)
1690 if mobj is None:
1691 self._downloader.trouble(u'ERROR: unable to extract title')
1692 return
1693 video_title = mobj.group(1).decode('utf-8')
490fd7ae 1694 video_title = sanitize_title(video_title)
e092418d 1695 simple_title = _simplify_title(video_title)
49c0028a 1696
7e58d568
RG
1697 # Extract video description
1698 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1699 if mobj is None:
1700 self._downloader.trouble(u'ERROR: unable to extract video description')
1701 return
1702 video_description = mobj.group(1).decode('utf-8')
1703 if not video_description:
1704 video_description = 'No description available.'
1705
1706 # Extract video thumbnail
1707 if self._downloader.params.get('forcethumbnail', False):
1708 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1709 try:
1710 webpage = urllib2.urlopen(request).read()
1711 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1712 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1713 return
1714 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1715 if mobj is None:
1716 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1717 return
1718 video_thumbnail = mobj.group(1)
1719 else: # we need something to pass to process_info
1720 video_thumbnail = ''
1721
49c0028a 1722 try:
1723 # Process video information
1724 self._downloader.process_info({
1725 'id': video_id.decode('utf-8'),
1726 'url': video_url.decode('utf-8'),
6ba562b0 1727 'uploader': u'NA',
138b11f3 1728 'upload_date': u'NA',
490fd7ae 1729 'title': video_title,
31cbdaaf 1730 'stitle': simple_title,
49c0028a 1731 'ext': video_extension.decode('utf-8'),
6ba562b0 1732 'format': u'NA',
e616ec0c 1733 'player_url': None,
49c0028a 1734 })
73f4e7af 1735 except UnavailableVideoError:
09cc744c 1736 self._downloader.trouble(u'\nERROR: unable to download video')
49c0028a 1737
1738
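# A minimal sketch of the escaped-URL cleanup used in the Google Video
# fallback above; the input value is hypothetical.
def _example_google_unescape(mediaURL):
    mediaURL = urllib.unquote(mediaURL)
    mediaURL = mediaURL.replace('\\x3d', '\x3d')  # literal "\x3d" becomes '='
    mediaURL = mediaURL.replace('\\x26', '\x26')  # literal "\x26" becomes '&'
    return mediaURL

# For example, a captured 'http://example.com/get%3Fid\x3d42\x26fmt\x3d18'
# becomes 'http://example.com/get?id=42&fmt=18'.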
1739 class PhotobucketIE(InfoExtractor):
1740 """Information extractor for photobucket.com."""
1741
1742 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
f3098c4d 1743 IE_NAME = u'photobucket'
49c0028a 1744
1745 def __init__(self, downloader=None):
1746 InfoExtractor.__init__(self, downloader)
1747
49c0028a 1748 def report_download_webpage(self, video_id):
1749 """Report webpage download."""
331ce0a0 1750 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
49c0028a 1751
1752 def report_extraction(self, video_id):
1753 """Report information extraction."""
331ce0a0 1754 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
49c0028a 1755
49c0028a 1756 def _real_extract(self, url):
1757 # Extract id from URL
1758 mobj = re.match(self._VALID_URL, url)
1759 if mobj is None:
1760 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1761 return
1762
df372a65 1763 # At this point we have a new video
9bf7fa52 1764 self._downloader.increment_downloads()
49c0028a 1765 video_id = mobj.group(1)
1766
1767 video_extension = 'flv'
1768
1769 # Retrieve video webpage to extract further information
1770 request = urllib2.Request(url)
1771 try:
1772 self.report_download_webpage(video_id)
1773 webpage = urllib2.urlopen(request).read()
1774 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1775 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1776 return
1777
1778 # Extract URL, uploader, and title from webpage
1779 self.report_extraction(video_id)
1780 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1781 if mobj is None:
1782 self._downloader.trouble(u'ERROR: unable to extract media URL')
1783 return
1784 mediaURL = urllib.unquote(mobj.group(1))
1785
1786 video_url = mediaURL
1787
1788 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1789 if mobj is None:
1790 self._downloader.trouble(u'ERROR: unable to extract title')
1791 return
1792 video_title = mobj.group(1).decode('utf-8')
490fd7ae 1793 video_title = sanitize_title(video_title)
e092418d 1794 simple_title = _simplify_title(video_title)
49c0028a 1795
1796 video_uploader = mobj.group(2).decode('utf-8')
1797
1798 try:
1799 # Process video information
1800 self._downloader.process_info({
1801 'id': video_id.decode('utf-8'),
1802 'url': video_url.decode('utf-8'),
490fd7ae 1803 'uploader': video_uploader,
138b11f3 1804 'upload_date': u'NA',
490fd7ae 1805 'title': video_title,
31cbdaaf 1806 'stitle': simple_title,
490fd7ae 1807 'ext': video_extension.decode('utf-8'),
6ba562b0 1808 'format': u'NA',
e616ec0c 1809 'player_url': None,
490fd7ae 1810 })
73f4e7af 1811 except UnavailableVideoError:
09cc744c 1812 self._downloader.trouble(u'\nERROR: unable to download video')
490fd7ae
RG
1813
1814
61945318
RG
1815 class YahooIE(InfoExtractor):
1816 """Information extractor for video.yahoo.com."""
1817
1818 # _VALID_URL matches all Yahoo! Video URLs
1819 # _VPAGE_URL matches only the extractable '/watch/' URLs
1820 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1821 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
f3098c4d 1822 IE_NAME = u'video.yahoo'
61945318
RG
1823
1824 def __init__(self, downloader=None):
1825 InfoExtractor.__init__(self, downloader)
1826
61945318
RG
1827 def report_download_webpage(self, video_id):
1828 """Report webpage download."""
331ce0a0 1829 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
61945318
RG
1830
1831 def report_extraction(self, video_id):
1832 """Report information extraction."""
331ce0a0 1833 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
61945318 1834
df372a65 1835 def _real_extract(self, url, new_video=True):
61945318
RG
1836 # Extract ID from URL
1837 mobj = re.match(self._VALID_URL, url)
1838 if mobj is None:
1839 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1840 return
1841
df372a65 1842 # At this point we have a new video
9bf7fa52 1843 self._downloader.increment_downloads()
61945318
RG
1844 video_id = mobj.group(2)
1845 video_extension = 'flv'
1846
1847 # Rewrite valid but non-extractable URLs as
1848 # extractable English language /watch/ URLs
1849 if re.match(self._VPAGE_URL, url) is None:
1850 request = urllib2.Request(url)
1851 try:
1852 webpage = urllib2.urlopen(request).read()
1853 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1854 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1855 return
1856
1857 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1858 if mobj is None:
1859 self._downloader.trouble(u'ERROR: Unable to extract id field')
1860 return
1861 yahoo_id = mobj.group(1)
1862
1863 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1864 if mobj is None:
1865 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1866 return
1867 yahoo_vid = mobj.group(1)
1868
1869 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
df372a65 1870 return self._real_extract(url, new_video=False)
61945318
RG
1871
1872 # Retrieve video webpage to extract further information
1873 request = urllib2.Request(url)
1874 try:
1875 self.report_download_webpage(video_id)
1876 webpage = urllib2.urlopen(request).read()
1877 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1878 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1879 return
1880
1881 # Extract uploader and title from webpage
1882 self.report_extraction(video_id)
1883 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1884 if mobj is None:
1885 self._downloader.trouble(u'ERROR: unable to extract video title')
1886 return
1887 video_title = mobj.group(1).decode('utf-8')
e092418d 1888 simple_title = _simplify_title(video_title)
61945318
RG
1889
1890 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1891 if mobj is None:
1892 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1893 return
1894 video_uploader = mobj.group(2).decode('utf-8')
1895
7e58d568
RG
1896 # Extract video thumbnail
1897 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1898 if mobj is None:
1899 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1900 return
1901 video_thumbnail = mobj.group(1).decode('utf-8')
1902
1903 # Extract video description
1904 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1905 if mobj is None:
1906 self._downloader.trouble(u'ERROR: unable to extract video description')
1907 return
1908 video_description = mobj.group(1).decode('utf-8')
c0a10ca8
F
1909 if not video_description:
1910 video_description = 'No description available.'
7e58d568 1911
61945318
RG
1912 # Extract video height and width
1913 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1914 if mobj is None:
1915 self._downloader.trouble(u'ERROR: unable to extract video height')
1916 return
1917 yv_video_height = mobj.group(1)
1918
1919 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1920 if mobj is None:
1921 self._downloader.trouble(u'ERROR: unable to extract video width')
1922 return
1923 yv_video_width = mobj.group(1)
1924
1925 # Retrieve video playlist to extract media URL
1926 # I'm not completely sure what all these options are, but we
1927 # seem to need most of them, otherwise the server sends a 401.
1928 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1929 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1930 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
c0a10ca8
F
1931 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1932 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
61945318
RG
1933 try:
1934 self.report_download_webpage(video_id)
1935 webpage = urllib2.urlopen(request).read()
1936 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1937 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1938 return
1939
1940 # Extract media URL from playlist XML
1941 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1942 if mobj is None:
1943 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1944 return
1945 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1946 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1947
1948 try:
1949 # Process video information
1950 self._downloader.process_info({
1951 'id': video_id.decode('utf-8'),
1952 'url': video_url,
1953 'uploader': video_uploader,
138b11f3 1954 'upload_date': u'NA',
61945318
RG
1955 'title': video_title,
1956 'stitle': simple_title,
1957 'ext': video_extension.decode('utf-8'),
7e58d568
RG
1958 'thumbnail': video_thumbnail.decode('utf-8'),
1959 'description': video_description,
e616ec0c 1961 'player_url': None,
61945318 1962 })
73f4e7af 1963 except UnavailableVideoError:
09cc744c 1964 self._downloader.trouble(u'\nERROR: unable to download video')
61945318
RG
1965
1966
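# A minimal sketch of the playlist request assembled above; video_id, height
# and width are hypothetical, while lg and bitrate are the fixed values the
# extractor always sends.
def _example_yahoo_playlist_url(video_id, yv_video_height, yv_video_width):
    yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
    yv_bitrate = '700'  # according to Wikipedia this is hard-coded
    return ('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
        '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
        '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')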
92743d42
RB
1967 class VimeoIE(InfoExtractor):
1968 """Information extractor for vimeo.com."""
1969
1970 # _VALID_URL matches Vimeo URLs
44c636df 1971 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
f3098c4d 1972 IE_NAME = u'vimeo'
92743d42
RB
1973
1974 def __init__(self, downloader=None):
1975 InfoExtractor.__init__(self, downloader)
1976
92743d42
RB
1977 def report_download_webpage(self, video_id):
1978 """Report webpage download."""
0ecedbdb 1979 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
92743d42
RB
1980
1981 def report_extraction(self, video_id):
1982 """Report information extraction."""
0ecedbdb 1983 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
92743d42 1984
92743d42
RB
1985 def _real_extract(self, url, new_video=True):
1986 # Extract ID from URL
1987 mobj = re.match(self._VALID_URL, url)
1988 if mobj is None:
1989 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1990 return
1991
1992 # At this point we have a new video
1993 self._downloader.increment_downloads()
1994 video_id = mobj.group(1)
92743d42
RB
1995
1996 # Retrieve video webpage to extract further information
1997 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1998 try:
1999 self.report_download_webpage(video_id)
2000 webpage = urllib2.urlopen(request).read()
2001 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2002 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2003 return
2004
f24c674b
RB
2005 # Now we begin extracting as much information as we can from what we
2006 # retrieved. First we extract the information common to all extractors,
2007 # and latter we extract those that are Vimeo specific.
92743d42 2008 self.report_extraction(video_id)
f24c674b
RB
2009
2010 # Extract title
c5a088d3 2011 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
92743d42
RB
2012 if mobj is None:
2013 self._downloader.trouble(u'ERROR: unable to extract video title')
2014 return
2015 video_title = mobj.group(1).decode('utf-8')
e092418d 2016 simple_title = _simplify_title(video_title)
92743d42 2017
f24c674b 2018 # Extract uploader
c5a088d3 2019 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
92743d42
RB
2020 if mobj is None:
2021 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2022 return
2023 video_uploader = mobj.group(1).decode('utf-8')
2024
2025 # Extract video thumbnail
c5a088d3 2026 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
92743d42
RB
2027 if mobj is None:
2028 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2029 return
2030 video_thumbnail = mobj.group(1).decode('utf-8')
2031
2032 # # Extract video description
2033 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2034 # if mobj is None:
2035 # self._downloader.trouble(u'ERROR: unable to extract video description')
2036 # return
2037 # video_description = mobj.group(1).decode('utf-8')
2038 # if not video_description: video_description = 'No description available.'
2039 video_description = 'Foo.'
2040
f24c674b 2041 # Vimeo specific: extract request signature
c5a088d3 2042 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
92743d42
RB
2043 if mobj is None:
2044 self._downloader.trouble(u'ERROR: unable to extract request signature')
2045 return
2046 sig = mobj.group(1).decode('utf-8')
2047
c424df0d
RB
2048 # Vimeo specific: extract video quality information
2049 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2050 if mobj is None:
2051 self._downloader.trouble(u'ERROR: unable to extract video quality information')
2052 return
2053 quality = mobj.group(1).decode('utf-8')
2054
2055 if int(quality) == 1:
2056 quality = 'hd'
2057 else:
2058 quality = 'sd'
2059
f24c674b 2060 # Vimeo specific: Extract request signature expiration
c5a088d3 2061 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
92743d42
RB
2062 if mobj is None:
2063 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2064 return
2065 sig_exp = mobj.group(1).decode('utf-8')
2066
c424df0d 2067 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
92743d42
RB
2068
2069 try:
2070 # Process video information
2071 self._downloader.process_info({
2072 'id': video_id.decode('utf-8'),
2073 'url': video_url,
2074 'uploader': video_uploader,
2075 'upload_date': u'NA',
2076 'title': video_title,
2077 'stitle': simple_title,
2fc31a48 2078 'ext': u'mp4',
92743d42
RB
2079 'thumbnail': video_thumbnail.decode('utf-8'),
2080 'description': video_description,
2083 'player_url': None,
2084 })
2085 except UnavailableVideoError:
2086 self._downloader.trouble(u'ERROR: unable to download video')
2087
2088
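# A minimal sketch of the final Vimeo play URL built above from the request
# signature, its expiration timestamp and the isHD flag; the argument values
# are hypothetical.
def _example_vimeo_play_url(video_id, sig, sig_exp, is_hd):
    if int(is_hd) == 1:
        quality = 'hd'
    else:
        quality = 'sd'
    return "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)

# For example, _example_vimeo_play_url('12345', 'deadbeef', '1300000000', '1')
# returns 'http://vimeo.com/moogaloop/play/clip:12345/deadbeef/1300000000/?q=hd'.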
490fd7ae
RG
2089 class GenericIE(InfoExtractor):
2090 """Generic last-resort information extractor."""
2091
f3098c4d
PH
2092 _VALID_URL = r'.*'
2093 IE_NAME = u'generic'
bdb3f7a7 2094
490fd7ae
RG
2095 def __init__(self, downloader=None):
2096 InfoExtractor.__init__(self, downloader)
2097
490fd7ae
RG
2098 def report_download_webpage(self, video_id):
2099 """Report webpage download."""
331ce0a0
RG
2100 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2101 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
490fd7ae
RG
2102
2103 def report_extraction(self, video_id):
2104 """Report information extraction."""
331ce0a0 2105 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
490fd7ae 2106
490fd7ae 2107 def _real_extract(self, url):
df372a65 2108 # At this point we have a new video
9bf7fa52 2109 self._downloader.increment_downloads()
df372a65 2110
490fd7ae
RG
2111 video_id = url.split('/')[-1]
2112 request = urllib2.Request(url)
2113 try:
2114 self.report_download_webpage(video_id)
2115 webpage = urllib2.urlopen(request).read()
2116 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2117 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2118 return
2119 except ValueError, err:
2120 # since this is the last-resort InfoExtractor, if
2121 # this error is thrown, it'll be thrown here
2122 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2123 return
2124
a9806fd8 2125 self.report_extraction(video_id)
490fd7ae
RG
2126 # Start with something easy: JW Player in SWFObject
2127 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2128 if mobj is None:
2129 # Broaden the search a little bit
2130 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2131 if mobj is None:
2132 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2133 return
2134
2135 # It's possible that one of the regexes
2136 # matched, but returned an empty group:
2137 if mobj.group(1) is None:
2138 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2139 return
2140
2141 video_url = urllib.unquote(mobj.group(1))
c0a10ca8 2142 video_id = os.path.basename(video_url)
490fd7ae
RG
2143
2144 # here's a fun little line of code for you:
2145 video_extension = os.path.splitext(video_id)[1][1:]
c0a10ca8 2146 video_id = os.path.splitext(video_id)[0]
490fd7ae
RG
2147
2148 # it's tempting to parse this further, but you would
2149 # have to take into account all the variations like
2150 # Video Title - Site Name
2151 # Site Name | Video Title
2152 # Video Title - Tagline | Site Name
2153 # and so on and so forth; it's just not practical
2154 mobj = re.search(r'<title>(.*)</title>', webpage)
2155 if mobj is None:
2156 self._downloader.trouble(u'ERROR: unable to extract title')
2157 return
2158 video_title = mobj.group(1).decode('utf-8')
2159 video_title = sanitize_title(video_title)
e092418d 2160 simple_title = _simplify_title(video_title)
490fd7ae
RG
2161
2162 # video uploader is domain name
2163 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2164 if mobj is None:
2165 self._downloader.trouble(u'ERROR: unable to extract uploader')
2166 return
2167 video_uploader = mobj.group(1).decode('utf-8')
2168
2169 try:
2170 # Process video information
2171 self._downloader.process_info({
2172 'id': video_id.decode('utf-8'),
2173 'url': video_url.decode('utf-8'),
2174 'uploader': video_uploader,
138b11f3 2175 'upload_date': u'NA',
490fd7ae 2176 'title': video_title,
31cbdaaf 2177 'stitle': simple_title,
49c0028a 2178 'ext': video_extension.decode('utf-8'),
6ba562b0 2179 'format': u'NA',
e616ec0c 2180 'player_url': None,
49c0028a 2181 })
73f4e7af 2182 except UnavailableVideoError, err:
09cc744c 2183 self._downloader.trouble(u'\nERROR: unable to download video')
49c0028a 2184
2185
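# A minimal sketch of the two-step "file=" search used by the generic
# extractor above, applied to a hypothetical page snippet.
def _example_generic_file_url(webpage):
    mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
    if mobj is None:
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
    if mobj is None or mobj.group(1) is None:
        return None
    return urllib.unquote(mobj.group(1))

# For example, a page containing flashvars: "file=http://cdn.example.com/clip.flv&autostart=1"
# yields 'http://cdn.example.com/clip.flv'.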
25af2bce
RG
2186 class YoutubeSearchIE(InfoExtractor):
2187 """Information Extractor for YouTube search queries."""
bdb3f7a7 2188 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
25af2bce
RG
2189 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2190 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
304a4d85 2191 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
25af2bce 2192 _youtube_ie = None
fd9288c3 2193 _max_youtube_results = 1000
f3098c4d 2194 IE_NAME = u'youtube:search'
25af2bce 2195
f995f712 2196 def __init__(self, youtube_ie, downloader=None):
25af2bce
RG
2197 InfoExtractor.__init__(self, downloader)
2198 self._youtube_ie = youtube_ie
d3975459 2199
25af2bce
RG
2200 def report_download_page(self, query, pagenum):
2201 """Report attempt to download playlist page with given number."""
490fd7ae 2202 query = query.decode(preferredencoding())
331ce0a0 2203 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
25af2bce
RG
2204
2205 def _real_initialize(self):
2206 self._youtube_ie.initialize()
d3975459 2207
25af2bce 2208 def _real_extract(self, query):
bdb3f7a7 2209 mobj = re.match(self._VALID_URL, query)
25af2bce 2210 if mobj is None:
147753eb 2211 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
6f21f686 2212 return
25af2bce
RG
2213
2214 prefix, query = query.split(':')
2215 prefix = prefix[8:]
c0a10ca8 2216 query = query.encode('utf-8')
f995f712 2217 if prefix == '':
6f21f686
RG
2218 self._download_n_results(query, 1)
2219 return
f995f712 2220 elif prefix == 'all':
6f21f686
RG
2221 self._download_n_results(query, self._max_youtube_results)
2222 return
f995f712 2223 else:
25af2bce 2224 try:
e1f18b8a 2225 n = long(prefix)
25af2bce 2226 if n <= 0:
147753eb 2227 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
6f21f686 2228 return
257453b9 2229 elif n > self._max_youtube_results:
c0a10ca8 2230 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
257453b9 2231 n = self._max_youtube_results
6f21f686
RG
2232 self._download_n_results(query, n)
2233 return
e1f18b8a 2234 except ValueError: # parsing prefix as integer fails
6f21f686
RG
2235 self._download_n_results(query, 1)
2236 return
25af2bce
RG
2237
2238 def _download_n_results(self, query, n):
2239 """Downloads a specified number of results for a query"""
2240
2241 video_ids = []
2242 already_seen = set()
2243 pagenum = 1
2244
2245 while True:
2246 self.report_download_page(query, pagenum)
a9633f14 2247 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1987c232 2248 request = urllib2.Request(result_url)
25af2bce
RG
2249 try:
2250 page = urllib2.urlopen(request).read()
2251 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 2252 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
6f21f686 2253 return
25af2bce
RG
2254
2255 # Extract video identifiers
2256 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2257 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2258 if video_id not in already_seen:
2259 video_ids.append(video_id)
2260 already_seen.add(video_id)
2261 if len(video_ids) == n:
2262 # Specified n videos reached
25af2bce 2263 for id in video_ids:
6f21f686
RG
2264 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2265 return
25af2bce 2266
304a4d85 2267 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
25af2bce 2268 for id in video_ids:
6f21f686
RG
2269 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2270 return
25af2bce
RG
2271
2272 pagenum = pagenum + 1
2273
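# A minimal sketch of how a 'ytsearchN:query' argument is interpreted above;
# max_results stands in for _max_youtube_results.
def _example_parse_search(query, max_results=1000):
    prefix, query = query.split(':')
    prefix = prefix[8:]  # strip 'ytsearch', leaving '', 'all' or a number
    if prefix == '':
        return query, 1
    elif prefix == 'all':
        return query, max_results
    try:
        n = long(prefix)
    except ValueError:  # non-numeric prefix falls back to a single result
        return query, 1
    if n > max_results:  # the extractor warns and clamps; n <= 0 is rejected
        n = max_results
    return query, n

# For example, _example_parse_search('ytsearch3:free software') returns ('free software', 3).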
c0a10ca8 2274
7e58d568
RG
2275 class GoogleSearchIE(InfoExtractor):
2276 """Information Extractor for Google Video search queries."""
bdb3f7a7 2277 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
7e58d568
RG
2278 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2279 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2280 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2281 _google_ie = None
2282 _max_google_results = 1000
f3098c4d 2283 IE_NAME = u'video.google:search'
7e58d568
RG
2284
2285 def __init__(self, google_ie, downloader=None):
2286 InfoExtractor.__init__(self, downloader)
2287 self._google_ie = google_ie
d3975459 2288
7e58d568
RG
2289 def report_download_page(self, query, pagenum):
2290 """Report attempt to download playlist page with given number."""
2291 query = query.decode(preferredencoding())
331ce0a0 2292 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
7e58d568
RG
2293
2294 def _real_initialize(self):
2295 self._google_ie.initialize()
d3975459 2296
7e58d568 2297 def _real_extract(self, query):
bdb3f7a7 2298 mobj = re.match(self._VALID_URL, query)
7e58d568
RG
2299 if mobj is None:
2300 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2301 return
2302
2303 prefix, query = query.split(':')
2304 prefix = prefix[8:]
c0a10ca8 2305 query = query.encode('utf-8')
7e58d568
RG
2306 if prefix == '':
2307 self._download_n_results(query, 1)
2308 return
2309 elif prefix == 'all':
2310 self._download_n_results(query, self._max_google_results)
2311 return
2312 else:
2313 try:
2314 n = long(prefix)
2315 if n <= 0:
2316 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2317 return
2318 elif n > self._max_google_results:
c0a10ca8 2319 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
7e58d568
RG
2320 n = self._max_google_results
2321 self._download_n_results(query, n)
2322 return
2323 except ValueError: # parsing prefix as integer fails
2324 self._download_n_results(query, 1)
2325 return
2326
2327 def _download_n_results(self, query, n):
2328 """Downloads a specified number of results for a query"""
2329
2330 video_ids = []
2331 already_seen = set()
2332 pagenum = 1
2333
2334 while True:
2335 self.report_download_page(query, pagenum)
2336 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1987c232 2337 request = urllib2.Request(result_url)
7e58d568
RG
2338 try:
2339 page = urllib2.urlopen(request).read()
2340 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2341 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2342 return
2343
2344 # Extract video identifiers
2345 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2346 video_id = mobj.group(1)
2347 if video_id not in already_seen:
2348 video_ids.append(video_id)
2349 already_seen.add(video_id)
2350 if len(video_ids) == n:
2351 # Specified n videos reached
2352 for id in video_ids:
2353 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2354 return
2355
2356 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2357 for id in video_ids:
2358 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2359 return
2360
2361 pagenum = pagenum + 1
2362
c0a10ca8 2363
7e58d568
RG
2364 class YahooSearchIE(InfoExtractor):
2365 """Information Extractor for Yahoo! Video search queries."""
bdb3f7a7 2366 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
7e58d568
RG
2367 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2368 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2369 _MORE_PAGES_INDICATOR = r'\s*Next'
2370 _yahoo_ie = None
2371 _max_yahoo_results = 1000
f3098c4d 2372 IE_NAME = u'video.yahoo:search'
7e58d568
RG
2373
2374 def __init__(self, yahoo_ie, downloader=None):
2375 InfoExtractor.__init__(self, downloader)
2376 self._yahoo_ie = yahoo_ie
d3975459 2377
7e58d568
RG
2378 def report_download_page(self, query, pagenum):
2379 """Report attempt to download playlist page with given number."""
2380 query = query.decode(preferredencoding())
331ce0a0 2381 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
7e58d568
RG
2382
2383 def _real_initialize(self):
2384 self._yahoo_ie.initialize()
d3975459 2385
7e58d568 2386 def _real_extract(self, query):
bdb3f7a7 2387 mobj = re.match(self._VALID_URL, query)
7e58d568
RG
2388 if mobj is None:
2389 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2390 return
2391
2392 prefix, query = query.split(':')
2393 prefix = prefix[8:]
c0a10ca8 2394 query = query.encode('utf-8')
7e58d568
RG
2395 if prefix == '':
2396 self._download_n_results(query, 1)
2397 return
2398 elif prefix == 'all':
2399 self._download_n_results(query, self._max_yahoo_results)
2400 return
2401 else:
2402 try:
2403 n = long(prefix)
2404 if n <= 0:
2405 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2406 return
2407 elif n > self._max_yahoo_results:
c0a10ca8 2408 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
7e58d568
RG
2409 n = self._max_yahoo_results
2410 self._download_n_results(query, n)
2411 return
2412 except ValueError: # parsing prefix as integer fails
2413 self._download_n_results(query, 1)
2414 return
2415
2416 def _download_n_results(self, query, n):
2417 """Downloads a specified number of results for a query"""
2418
2419 video_ids = []
2420 already_seen = set()
2421 pagenum = 1
2422
2423 while True:
2424 self.report_download_page(query, pagenum)
2425 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1987c232 2426 request = urllib2.Request(result_url)
7e58d568
RG
2427 try:
2428 page = urllib2.urlopen(request).read()
2429 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2430 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2431 return
2432
2433 # Extract video identifiers
2434 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2435 video_id = mobj.group(1)
2436 if video_id not in already_seen:
2437 video_ids.append(video_id)
2438 already_seen.add(video_id)
2439 if len(video_ids) == n:
2440 # Specified n videos reached
2441 for id in video_ids:
2442 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2443 return
2444
2445 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2446 for id in video_ids:
2447 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2448 return
2449
2450 pagenum = pagenum + 1
2451
c0a10ca8 2452
0c2dc87d
RG
2453 class YoutubePlaylistIE(InfoExtractor):
2454 """Information Extractor for YouTube playlists."""
2455
c3e4e7c1 2456 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
f74e22ae 2457 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
0c2dc87d 2458 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
ce5cafea 2459 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
0c2dc87d 2460 _youtube_ie = None
f3098c4d 2461 IE_NAME = u'youtube:playlist'
0c2dc87d
RG
2462
2463 def __init__(self, youtube_ie, downloader=None):
2464 InfoExtractor.__init__(self, downloader)
2465 self._youtube_ie = youtube_ie
d3975459 2466
0c2dc87d
RG
2467 def report_download_page(self, playlist_id, pagenum):
2468 """Report attempt to download playlist page with given number."""
331ce0a0 2469 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
0c2dc87d
RG
2470
2471 def _real_initialize(self):
2472 self._youtube_ie.initialize()
d3975459 2473
0c2dc87d
RG
2474 def _real_extract(self, url):
2475 # Extract playlist id
2476 mobj = re.match(self._VALID_URL, url)
2477 if mobj is None:
147753eb 2478 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
6f21f686 2479 return
0c2dc87d 2480
d119b54d
RG
2481 # Single video case
2482 if mobj.group(3) is not None:
2483 self._youtube_ie.extract(mobj.group(3))
2484 return
2485
0c2dc87d 2486 # Download playlist pages
f74e22ae
GI
2487 # prefix is 'p' as default for playlists but there are other types that need extra care
2488 playlist_prefix = mobj.group(1)
2489 if playlist_prefix == 'a':
2490 playlist_access = 'artist'
2491 else:
7cc3c6fd 2492 playlist_prefix = 'p'
f74e22ae
GI
2493 playlist_access = 'view_play_list'
2494 playlist_id = mobj.group(2)
0c2dc87d
RG
2495 video_ids = []
2496 pagenum = 1
2497
2498 while True:
2499 self.report_download_page(playlist_id, pagenum)
c3e4e7c1
PH
2500 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2501 request = urllib2.Request(url)
0c2dc87d
RG
2502 try:
2503 page = urllib2.urlopen(request).read()
2504 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
147753eb 2505 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
6f21f686 2506 return
0c2dc87d
RG
2507
2508 # Extract video identifiers
27d98b6e 2509 ids_in_page = []
0c2dc87d 2510 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
27d98b6e
RG
2511 if mobj.group(1) not in ids_in_page:
2512 ids_in_page.append(mobj.group(1))
2513 video_ids.extend(ids_in_page)
0c2dc87d 2514
ce5cafea 2515 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
0c2dc87d
RG
2516 break
2517 pagenum = pagenum + 1
2518
8cc44341
RG
2519 playliststart = self._downloader.params.get('playliststart', 1) - 1
2520 playlistend = self._downloader.params.get('playlistend', -1)
2521 video_ids = video_ids[playliststart:playlistend]
2522
0c2dc87d 2523 for id in video_ids:
6f21f686
RG
2524 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2525 return
0c2dc87d 2526
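# A minimal sketch of the first page URL built above from a playlist match:
# playlist_prefix is group 1 of _VALID_URL and playlist_id is group 2; the
# sample values are hypothetical.
def _example_playlist_page_url(playlist_prefix, playlist_id, pagenum=1):
    if playlist_prefix == 'a':
        playlist_access = 'artist'
    else:
        playlist_prefix = 'p'
        playlist_access = 'view_play_list'
    return 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en' % (playlist_access, playlist_prefix, playlist_id, pagenum)

# For example, _example_playlist_page_url('list', 'AB12cd34EF') returns
# 'http://www.youtube.com/view_play_list?p=AB12cd34EF&page=1&gl=US&hl=en'.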
c0a10ca8 2527
c39c05cd
A
2528 class YoutubeUserIE(InfoExtractor):
2529 """Information Extractor for YouTube users."""
2530
b845d58b 2531 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
c39c05cd 2532 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
5aba6ea4
RG
2533 _GDATA_PAGE_SIZE = 50
2534 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
abeac45a 2535 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
c39c05cd 2536 _youtube_ie = None
f3098c4d 2537 IE_NAME = u'youtube:user'
c39c05cd
A
2538
2539 def __init__(self, youtube_ie, downloader=None):
2540 InfoExtractor.__init__(self, downloader)
2541 self._youtube_ie = youtube_ie
d3975459 2542
5aba6ea4 2543 def report_download_page(self, username, start_index):
c39c05cd 2544 """Report attempt to download user page."""
5aba6ea4 2545 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
c0a10ca8 2546 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
c39c05cd
A
2547
2548 def _real_initialize(self):
2549 self._youtube_ie.initialize()
d3975459 2550
c39c05cd
A
2551 def _real_extract(self, url):
2552 # Extract username
2553 mobj = re.match(self._VALID_URL, url)
2554 if mobj is None:
2555 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2556 return
2557
c39c05cd 2558 username = mobj.group(1)
5aba6ea4
RG
2559
2560 # Download video ids using YouTube Data API. Result size per
2561 # query is limited (currently to 50 videos) so we need to query
2562 # page by page until there are no video ids - it means we got
2563 # all of them.
2564
c39c05cd 2565 video_ids = []
5aba6ea4 2566 pagenum = 0
c39c05cd 2567
5aba6ea4
RG
2568 while True:
2569 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2570 self.report_download_page(username, start_index)
c39c05cd 2571
5aba6ea4 2572 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
c39c05cd 2573
5aba6ea4
RG
2574 try:
2575 page = urllib2.urlopen(request).read()
2576 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2577 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2578 return
c39c05cd 2579
5aba6ea4
RG
2580 # Extract video identifiers
2581 ids_in_page = []
2582
2583 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2584 if mobj.group(1) not in ids_in_page:
2585 ids_in_page.append(mobj.group(1))
2586
2587 video_ids.extend(ids_in_page)
2588
2589 # A little optimization - if current page is not
2590 # "full", i.e. does not contain _GDATA_PAGE_SIZE video ids, then
2591 # we can assume that this page is the last one - there
2592 # are no more ids on further pages - no need to query
2593 # again.
2594
2595 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2596 break
2597
2598 pagenum += 1
2599
2600 all_ids_count = len(video_ids)
8cc44341
RG
2601 playliststart = self._downloader.params.get('playliststart', 1) - 1
2602 playlistend = self._downloader.params.get('playlistend', -1)
204c9398 2603
5aba6ea4
RG
2604 if playlistend == -1:
2605 video_ids = video_ids[playliststart:]
2606 else:
2607 video_ids = video_ids[playliststart:playlistend]
7a9054ec 2608
5aba6ea4 2609 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
c0a10ca8 2610 (username, all_ids_count, len(video_ids)))
5aba6ea4
RG
2611
2612 for video_id in video_ids:
2613 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2614
c39c05cd 2615
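# A minimal sketch of the GData paging above: page 0 requests entries 1..50,
# page 1 requests 51..100, and so on, until a page comes back with fewer than
# _GDATA_PAGE_SIZE ids. The username is hypothetical.
def _example_gdata_page_url(username, pagenum, page_size=50):
    start_index = pagenum * page_size + 1
    return 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d' % (username, page_size, start_index)

# For example, _example_gdata_page_url('someuser', 2) returns
# 'http://gdata.youtube.com/feeds/api/users/someuser/uploads?max-results=50&start-index=101'.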
27179cfd
VV
2616 class DepositFilesIE(InfoExtractor):
2617 """Information extractor for depositfiles.com"""
2618
b845d58b 2619 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
f3098c4d 2620 IE_NAME = u'DepositFiles'
27179cfd
VV
2621
2622 def __init__(self, downloader=None):
2623 InfoExtractor.__init__(self, downloader)
2624
27179cfd
VV
2625 def report_download_webpage(self, file_id):
2626 """Report webpage download."""
2627 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2628
2629 def report_extraction(self, file_id):
2630 """Report information extraction."""
2631 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2632
27179cfd
VV
2633 def _real_extract(self, url):
2634 # At this point we have a new file
2635 self._downloader.increment_downloads()
2636
2637 file_id = url.split('/')[-1]
2638 # Rebuild url in english locale
2639 url = 'http://depositfiles.com/en/files/' + file_id
2640
2641 # Retrieve file webpage with 'Free download' button pressed
2642 free_download_indication = { 'gateway_result' : '1' }
1987c232 2643 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
27179cfd
VV
2644 try:
2645 self.report_download_webpage(file_id)
2646 webpage = urllib2.urlopen(request).read()
2647 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2648 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2649 return
2650
2651 # Search for the real file URL
2652 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2653 if (mobj is None) or (mobj.group(1) is None):
2654 # Try to figure out reason of the error.
2655 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2656 if (mobj is not None) and (mobj.group(1) is not None):
2657 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2658 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2659 else:
2660 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2661 return
2662
2663 file_url = mobj.group(1)
2664 file_extension = os.path.splitext(file_url)[1][1:]
2665
2666 # Search for file title
2667 mobj = re.search(r'<b title="(.*?)">', webpage)
2668 if mobj is None:
2669 self._downloader.trouble(u'ERROR: unable to extract title')
2670 return
2671 file_title = mobj.group(1).decode('utf-8')
2672
2673 try:
2674 # Process file information
2675 self._downloader.process_info({
2676 'id': file_id.decode('utf-8'),
2677 'url': file_url.decode('utf-8'),
2678 'uploader': u'NA',
2679 'upload_date': u'NA',
2680 'title': file_title,
2681 'stitle': file_title,
2682 'ext': file_extension.decode('utf-8'),
2683 'format': u'NA',
2684 'player_url': None,
2685 })
2686 except UnavailableVideoError, err:
2687 self._downloader.trouble(u'ERROR: unable to download file')
2688
c0a10ca8 2689
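# A minimal sketch of the request prepared above: the URL is rebuilt in the
# English locale and the 'Free download' form field is POSTed along with it.
# The sample URL is hypothetical.
def _example_depositfiles_request(url):
    file_id = url.split('/')[-1]
    url = 'http://depositfiles.com/en/files/' + file_id
    free_download_indication = {'gateway_result': '1'}
    return urllib2.Request(url, urllib.urlencode(free_download_indication))

# For example, _example_depositfiles_request('http://depositfiles.com/de/files/abc123')
# posts the form to 'http://depositfiles.com/en/files/abc123'.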
9f5f9602
GI
2690 class FacebookIE(InfoExtractor):
2691 """Information Extractor for Facebook"""
2692
857e5f32 2693 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
9f5f9602
GI
2694 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2695 _NETRC_MACHINE = 'facebook'
0067bbe7 2696 _available_formats = ['video', 'highqual', 'lowqual']
9f5f9602 2697 _video_extensions = {
0067bbe7 2698 'video': 'mp4',
9f5f9602
GI
2699 'highqual': 'mp4',
2700 'lowqual': 'mp4',
2701 }
f3098c4d 2702 IE_NAME = u'facebook'
9f5f9602
GI
2703
2704 def __init__(self, downloader=None):
2705 InfoExtractor.__init__(self, downloader)
2706
9f5f9602
GI
2707 def _reporter(self, message):
2708 """Add header and report message."""
2709 self._downloader.to_screen(u'[facebook] %s' % message)
2710
2711 def report_login(self):
2712 """Report attempt to log in."""
2713 self._reporter(u'Logging in')
2714
2715 def report_video_webpage_download(self, video_id):
2716 """Report attempt to download video webpage."""
2717 self._reporter(u'%s: Downloading video webpage' % video_id)
2718
2719 def report_information_extraction(self, video_id):
2720 """Report attempt to extract video information."""
2721 self._reporter(u'%s: Extracting video information' % video_id)
2722
2723 def _parse_page(self, video_webpage):
2724 """Extract video information from page"""
2725 # General data
99e207ba 2726 data = {'title': r'\("video_title", "(.*?)"\)',
9f5f9602
GI
2727 'description': r'<div class="datawrap">(.*?)</div>',
2728 'owner': r'\("video_owner_name", "(.*?)"\)',
9f5f9602
GI
2729 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2730 }
2731 video_info = {}
2732 for piece in data.keys():
2733 mobj = re.search(data[piece], video_webpage)
2734 if mobj is not None:
2735 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2736
2737 # Video urls
2738 video_urls = {}
2739 for fmt in self._available_formats:
2740 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2741 if mobj is not None:
2742 # URL is in a Javascript segment inside an escaped Unicode format within
2743 # the generally utf-8 page
2744 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2745 video_info['video_urls'] = video_urls
2746
2747 return video_info
2748
2749 def _real_initialize(self):
2750 if self._downloader is None:
2751 return
2752
2753 useremail = None
2754 password = None
2755 downloader_params = self._downloader.params
2756
2757 # Attempt to use provided username and password or .netrc data
2758 if downloader_params.get('username', None) is not None:
2759 useremail = downloader_params['username']
2760 password = downloader_params['password']
2761 elif downloader_params.get('usenetrc', False):
2762 try:
2763 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2764 if info is not None:
2765 useremail = info[0]
2766 password = info[2]
2767 else:
2768 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2769 except (IOError, netrc.NetrcParseError), err:
2770 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2771 return
2772
2773 if useremail is None:
2774 return
2775
2776 # Log in
2777 login_form = {
2778 'email': useremail,
2779 'pass': password,
2780 'login': 'Log+In'
2781 }
2782 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2783 try:
2784 self.report_login()
2785 login_results = urllib2.urlopen(request).read()
2786 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2787 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
2788 return
2789 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2790 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2791 return
2792
2793 def _real_extract(self, url):
2794 mobj = re.match(self._VALID_URL, url)
2795 if mobj is None:
2796 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2797 return
2798 video_id = mobj.group('ID')
2799
2800 # Get video webpage
2801 self.report_video_webpage_download(video_id)
2802 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2803 try:
2804 page = urllib2.urlopen(request)
2805 video_webpage = page.read()
2806 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2807 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2808 return
2809
2810 # Start extracting information
2811 self.report_information_extraction(video_id)
2812
2813 # Extract information
2814 video_info = self._parse_page(video_webpage)
2815
2816 # uploader
2817 if 'owner' not in video_info:
2818 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2819 return
2820 video_uploader = video_info['owner']
2821
2822 # title
2823 if 'title' not in video_info:
2824 self._downloader.trouble(u'ERROR: unable to extract video title')
2825 return
2826 video_title = video_info['title']
2827 video_title = video_title.decode('utf-8')
2828 video_title = sanitize_title(video_title)
2829
e092418d 2830 simple_title = _simplify_title(video_title)
9f5f9602
GI
2831
2832 # thumbnail image
2833 if 'thumbnail' not in video_info:
2834 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2835 video_thumbnail = ''
2836 else:
2837 video_thumbnail = video_info['thumbnail']
2838
2839 # upload date
2840 upload_date = u'NA'
2841 if 'upload_date' in video_info:
2842 upload_time = video_info['upload_date']
2843 timetuple = email.utils.parsedate_tz(upload_time)
2844 if timetuple is not None:
2845 try:
2846 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2847 except:
2848 pass
2849
2850 # description
8b95c387 2851 video_description = video_info.get('description', 'No description available.')
9f5f9602
GI
2852
2853 url_map = video_info['video_urls']
2854 if len(url_map.keys()) > 0:
2855 # Decide which formats to download
2856 req_format = self._downloader.params.get('format', None)
2857 format_limit = self._downloader.params.get('format_limit', None)
2858
2859 if format_limit is not None and format_limit in self._available_formats:
2860 format_list = self._available_formats[self._available_formats.index(format_limit):]
2861 else:
2862 format_list = self._available_formats
2863 existing_formats = [x for x in format_list if x in url_map]
2864 if len(existing_formats) == 0:
2865 self._downloader.trouble(u'ERROR: no known formats available for video')
2866 return
2867 if req_format is None:
2868 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
c52b01f3
K
2869 elif req_format == 'worst':
2870 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
9f5f9602
GI
2871 elif req_format == '-1':
2872 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2873 else:
2874 # Specific format
2875 if req_format not in url_map:
2876 self._downloader.trouble(u'ERROR: requested format not available')
2877 return
2878 video_url_list = [(req_format, url_map[req_format])] # Specific format
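# In summary: with no requested format the best available one is chosen,
# 'worst' picks the lowest quality, '-1' downloads every available format,
# and any other value must be one of the format codes present in url_map.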
2879
2880 for format_param, video_real_url in video_url_list:
2881
2882 # At this point we have a new video
2883 self._downloader.increment_downloads()
2884
2885 # Extension
2886 video_extension = self._video_extensions.get(format_param, 'mp4')
2887
9f5f9602
GI
2888 try:
2889 # Process video information
2890 self._downloader.process_info({
2891 'id': video_id.decode('utf-8'),
2892 'url': video_real_url.decode('utf-8'),
2893 'uploader': video_uploader.decode('utf-8'),
2894 'upload_date': upload_date,
2895 'title': video_title,
2896 'stitle': simple_title,
2897 'ext': video_extension.decode('utf-8'),
2898 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2899 'thumbnail': video_thumbnail.decode('utf-8'),
2900 'description': video_description.decode('utf-8'),
2901 'player_url': None,
2902 })
2903 except UnavailableVideoError, err:
2904 self._downloader.trouble(u'\nERROR: unable to download video')
2905
7745f5d8
PH
2906class BlipTVIE(InfoExtractor):
2907 """Information extractor for blip.tv"""
2908
1cab2c6d 2909 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
7745f5d8 2910 _URL_EXT = r'^.*\.([a-z0-9]+)$'
f3098c4d 2911 IE_NAME = u'blip.tv'
7745f5d8 2912
7745f5d8
PH
2913 def report_extraction(self, file_id):
2914 """Report information extraction."""
54f329fe
PH
2915 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2916
2917 def report_direct_download(self, title):
2918 """Report information extraction."""
2919 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
7745f5d8 2920
7745f5d8
PH
2921 def _real_extract(self, url):
2922 mobj = re.match(self._VALID_URL, url)
2923 if mobj is None:
2924 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2925 return
2926
1293ce58
PH
2927 if '?' in url:
2928 cchar = '&'
2929 else:
2930 cchar = '?'
2931 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
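# Illustrative example (hypothetical URL): http://blip.tv/file/123456 becomes
# http://blip.tv/file/123456?skin=json&version=2&no_wrap=1, which makes blip.tv
# return a JSON description of the post that is parsed with json.loads() below.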
7745f5d8 2932 request = urllib2.Request(json_url)
aded78d9 2933 self.report_extraction(mobj.group(1))
54f329fe 2934 info = None
7745f5d8 2935 try:
54f329fe
PH
2936 urlh = urllib2.urlopen(request)
2937 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2938 basename = url.split('/')[-1]
2939 title,ext = os.path.splitext(basename)
2940 ext = ext.replace('.', '')
2941 self.report_direct_download(title)
2942 info = {
2943 'id': title,
2944 'url': url,
2945 'title': title,
e092418d 2946 'stitle': _simplify_title(title),
54f329fe
PH
2947 'ext': ext,
2948 'urlhandle': urlh
2949 }
7745f5d8
PH
2950 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2951 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2952 return
54f329fe
PH
2953 if info is None: # Regular URL
2954 try:
2955 json_code = urlh.read()
2956 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2957 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2958 return
7745f5d8 2959
54f329fe
PH
2960 try:
2961 json_data = json.loads(json_code)
2962 if 'Post' in json_data:
2963 data = json_data['Post']
2964 else:
2965 data = json_data
2966
2967 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2968 video_url = data['media']['url']
2969 umobj = re.match(self._URL_EXT, video_url)
2970 if umobj is None:
2971 raise ValueError('Cannot determine filename extension')
2972 ext = umobj.group(1)
2973
2974 info = {
2975 'id': data['item_id'],
2976 'url': video_url,
2977 'uploader': data['display_name'],
2978 'upload_date': upload_date,
2979 'title': data['title'],
e092418d 2980 'stitle': _simplify_title(data['title']),
54f329fe
PH
2981 'ext': ext,
2982 'format': data['media']['mimeType'],
2983 'thumbnail': data['thumbnailUrl'],
2984 'description': data['description'],
2985 'player_url': data['embedUrl']
2986 }
2987 except (ValueError,KeyError), err:
2988 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2989 return
a1cab7ce 2990
54f329fe 2991 self._downloader.increment_downloads()
7745f5d8
PH
2992
2993 try:
2994 self._downloader.process_info(info)
2995 except UnavailableVideoError, err:
2996 self._downloader.trouble(u'\nERROR: unable to download video')
2997
2998
9b0a8bc1
PH
2999class MyVideoIE(InfoExtractor):
3000 """Information Extractor for myvideo.de."""
3001
3002 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
f3098c4d 3003 IE_NAME = u'myvideo'
9b0a8bc1
PH
3004
3005 def __init__(self, downloader=None):
3006 InfoExtractor.__init__(self, downloader)
3007
9b0a8bc1
PH
3008 def report_download_webpage(self, video_id):
3009 """Report webpage download."""
3010 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3011
3012 def report_extraction(self, video_id):
3013 """Report information extraction."""
3014 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3015
9b0a8bc1
PH
3016 def _real_extract(self,url):
3017 mobj = re.match(self._VALID_URL, url)
3018 if mobj is None:
3019 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3020 return
3021
3022 video_id = mobj.group(1)
9b0a8bc1
PH
3023
3024 # Get video webpage
3025 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3026 try:
3027 self.report_download_webpage(video_id)
3028 webpage = urllib2.urlopen(request).read()
3029 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3030 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3031 return
3032
3033 self.report_extraction(video_id)
3034 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3035 webpage)
3036 if mobj is None:
3037 self._downloader.trouble(u'ERROR: unable to extract media URL')
3038 return
3039 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3040
3041 mobj = re.search('<title>([^<]+)</title>', webpage)
3042 if mobj is None:
3043 self._downloader.trouble(u'ERROR: unable to extract title')
3044 return
3045
3046 video_title = mobj.group(1)
3047 video_title = sanitize_title(video_title)
3048
e092418d
PH
3049 simple_title = _simplify_title(video_title)
3050
9b0a8bc1 3051 try:
9b0a8bc1
PH
3052 self._downloader.process_info({
3053 'id': video_id,
3054 'url': video_url,
3055 'uploader': u'NA',
3056 'upload_date': u'NA',
3057 'title': video_title,
3058 'stitle': simple_title,
3059 'ext': u'flv',
3060 'format': u'NA',
3061 'player_url': None,
3062 })
3063 except UnavailableVideoError:
3064 self._downloader.trouble(u'\nERROR: Unable to download video')
3065
c8e30044 3066class ComedyCentralIE(InfoExtractor):
f166bccc 3067 """Information extractor for The Daily Show and Colbert Report """
c8e30044 3068
f3098c4d
PH
3069 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3070 IE_NAME = u'comedycentral'
c8e30044 3071
c8e30044
PH
3072 def report_extraction(self, episode_id):
3073 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3074
3075 def report_config_download(self, episode_id):
3076 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3077
b487ef08
PH
3078 def report_index_download(self, episode_id):
3079 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3080
fedf9f39
PH
3081 def report_player_url(self, episode_id):
3082 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3083
c8e30044
PH
3084 def _real_extract(self, url):
3085 mobj = re.match(self._VALID_URL, url)
3086 if mobj is None:
3087 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3088 return
f166bccc
PH
3089
3090 if mobj.group('shortname'):
3091 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3092 url = 'http://www.thedailyshow.com/full-episodes/'
3093 else:
3094 url = 'http://www.colbertnation.com/full-episodes/'
3095 mobj = re.match(self._VALID_URL, url)
3096 assert mobj is not None
3097
3098 dlNewest = not mobj.group('episode')
3099 if dlNewest:
3100 epTitle = mobj.group('showname')
3101 else:
3102 epTitle = mobj.group('episode')
c8e30044
PH
3103
3104 req = urllib2.Request(url)
3105 self.report_extraction(epTitle)
3106 try:
f166bccc
PH
3107 htmlHandle = urllib2.urlopen(req)
3108 html = htmlHandle.read()
c8e30044
PH
3109 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3110 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3111 return
f166bccc
PH
3112 if dlNewest:
3113 url = htmlHandle.geturl()
3114 mobj = re.match(self._VALID_URL, url)
3115 if mobj is None:
3116 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3117 return
3118 if mobj.group('episode') == '':
3119 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3120 return
3121 epTitle = mobj.group('episode')
c8e30044 3122
b487ef08 3123 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
c8e30044
PH
3124 if len(mMovieParams) == 0:
3125 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3126 return
b487ef08
PH
3127
3128 playerUrl_raw = mMovieParams[0][0]
fedf9f39
PH
3129 self.report_player_url(epTitle)
3130 try:
b487ef08
PH
3131 urlHandle = urllib2.urlopen(playerUrl_raw)
3132 playerUrl = urlHandle.geturl()
3133 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3134 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3135 return
3136
3137 uri = mMovieParams[0][1]
3138 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3139 self.report_index_download(epTitle)
3140 try:
3141 indexXml = urllib2.urlopen(indexUrl).read()
fedf9f39 3142 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
b487ef08 3143 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
fedf9f39 3144 return
fedf9f39 3145
b487ef08
PH
3146 idoc = xml.etree.ElementTree.fromstring(indexXml)
3147 itemEls = idoc.findall('.//item')
3148 for itemEl in itemEls:
3149 mediaId = itemEl.findall('./guid')[0].text
3150 shortMediaId = mediaId.split(':')[-1]
3151 showId = mediaId.split(':')[-2].replace('.com', '')
3152 officialTitle = itemEl.findall('./title')[0].text
3153 officialDate = itemEl.findall('./pubDate')[0].text
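# Each <item> in the show index describes one video; its <guid> is a
# colon-separated media URI, so its last component is used as the media id
# and the component before it (minus '.com') as the show id.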
3154
c8e30044
PH
3155 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3156 urllib.urlencode({'uri': mediaId}))
3157 configReq = urllib2.Request(configUrl)
3158 self.report_config_download(epTitle)
3159 try:
3160 configXml = urllib2.urlopen(configReq).read()
3161 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3162 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3163 return
46c8c432 3164
c8e30044
PH
3165 cdoc = xml.etree.ElementTree.fromstring(configXml)
3166 turls = []
3167 for rendition in cdoc.findall('.//rendition'):
3168 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3169 turls.append(finfo)
3170
a88bc6bb 3171 if len(turls) == 0:
b487ef08 3172 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
a88bc6bb
PH
3173 continue
3174
c8e30044
PH
3175 # For now, just pick the highest bitrate
3176 format,video_url = turls[-1]
3177
3178 self._downloader.increment_downloads()
a88bc6bb 3179
b487ef08 3180 effTitle = showId + '-' + epTitle
c8e30044 3181 info = {
b487ef08 3182 'id': shortMediaId,
c8e30044 3183 'url': video_url,
b487ef08
PH
3184 'uploader': showId,
3185 'upload_date': officialDate,
a88bc6bb
PH
3186 'title': effTitle,
3187 'stitle': _simplify_title(effTitle),
c8e30044
PH
3188 'ext': 'mp4',
3189 'format': format,
3190 'thumbnail': None,
b487ef08
PH
3191 'description': officialTitle,
3192 'player_url': playerUrl
c8e30044 3193 }
46c8c432 3194
c8e30044
PH
3195 try:
3196 self._downloader.process_info(info)
3197 except UnavailableVideoError, err:
b487ef08 3198 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
a88bc6bb 3199 continue
c8e30044
PH
3200
3201
f9c68787
PH
3202class EscapistIE(InfoExtractor):
3203 """Information extractor for The Escapist """
3204
b845d58b 3205 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
f3098c4d 3206 IE_NAME = u'escapist'
f9c68787 3207
f9c68787
PH
3208 def report_extraction(self, showName):
3209 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3210
3211 def report_config_download(self, showName):
3212 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3213
f9c68787
PH
3214 def _real_extract(self, url):
3215 htmlParser = HTMLParser.HTMLParser()
3216
3217 mobj = re.match(self._VALID_URL, url)
3218 if mobj is None:
3219 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3220 return
3221 showName = mobj.group('showname')
3222 videoId = mobj.group('episode')
3223
3224 self.report_extraction(showName)
3225 try:
3226 webPage = urllib2.urlopen(url).read()
3227 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3228 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3229 return
3230
3231 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3232 description = htmlParser.unescape(descMatch.group(1))
3233 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3234 imgUrl = htmlParser.unescape(imgMatch.group(1))
3235 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3236 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3237 configUrlMatch = re.search('config=(.*)$', playerUrl)
3238 configUrl = urllib2.unquote(configUrlMatch.group(1))
3239
3240 self.report_config_download(showName)
3241 try:
3242 configJSON = urllib2.urlopen(configUrl).read()
3243 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3244 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3245 return
3246
3247 # Technically, it's JavaScript, not JSON
3248 configJSON = configJSON.replace("'", '"')
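# Illustrative example: a config like {'playlist': [{'url': '...'}]} becomes
# {"playlist": [{"url": "..."}]} so json.loads() below can parse it; this naive
# quote swap assumes the values themselves contain no single quotes.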
3249
3250 try:
3251 config = json.loads(configJSON)
3252 except (ValueError,), err:
3253 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3254 return
3255
3256 playlist = config['playlist']
3257 videoUrl = playlist[1]['url']
3258
3259 self._downloader.increment_downloads()
3260 info = {
3261 'id': videoId,
3262 'url': videoUrl,
3263 'uploader': showName,
3264 'upload_date': None,
3265 'title': showName,
e092418d 3266 'stitle': _simplify_title(showName),
f9c68787
PH
3267 'ext': 'flv',
3268 'format': 'flv',
3269 'thumbnail': imgUrl,
3270 'description': description,
3271 'player_url': playerUrl,
3272 }
3273
3274 try:
3275 self._downloader.process_info(info)
3276 except UnavailableVideoError, err:
3277 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3278
3279
8d89fbae
PH
3280class CollegeHumorIE(InfoExtractor):
3281 """Information extractor for collegehumor.com"""
3282
3283 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3284 IE_NAME = u'collegehumor'
3285
3286 def report_webpage(self, video_id):
3287 """Report information extraction."""
3288 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3289
3290 def report_extraction(self, video_id):
3291 """Report information extraction."""
3292 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3293
8d89fbae
PH
3294 def _real_extract(self, url):
3295 htmlParser = HTMLParser.HTMLParser()
3296
3297 mobj = re.match(self._VALID_URL, url)
3298 if mobj is None:
3299 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3300 return
3301 video_id = mobj.group('videoid')
3302
3303 self.report_webpage(video_id)
3304 request = urllib2.Request(url)
3305 try:
3306 webpage = urllib2.urlopen(request).read()
3307 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3308 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3309 return
3310
3311 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3312 if m is None:
3313 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3314 return
3315 internal_video_id = m.group('internalvideoid')
3316
3317 info = {
3318 'id': video_id,
3319 'internal_id': internal_video_id,
3320 }
3321
3322 self.report_extraction(video_id)
3323 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3324 try:
3325 metaXml = urllib2.urlopen(xmlUrl).read()
3326 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3327 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3328 return
3329
3330 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3331 try:
3332 videoNode = mdoc.findall('./video')[0]
3333 info['description'] = videoNode.findall('./description')[0].text
3334 info['title'] = videoNode.findall('./caption')[0].text
e092418d 3335 info['stitle'] = _simplify_title(info['title'])
8d89fbae
PH
3336 info['url'] = videoNode.findall('./file')[0].text
3337 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3338 info['ext'] = info['url'].rpartition('.')[2]
3339 info['format'] = info['ext']
3340 except IndexError:
3341 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3342 return
3343
3344 self._downloader.increment_downloads()
3345
3346 try:
3347 self._downloader.process_info(info)
3348 except UnavailableVideoError, err:
3349 self._downloader.trouble(u'\nERROR: unable to download video')
3350
f9c68787 3351
6501a06d
RB
3352class XVideosIE(InfoExtractor):
3353 """Information extractor for xvideos.com"""
3354
3355 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3356 IE_NAME = u'xvideos'
3357
3358 def report_webpage(self, video_id):
3359 """Report information extraction."""
3360 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3361
3362 def report_extraction(self, video_id):
3363 """Report information extraction."""
3364 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3365
6501a06d
RB
3366 def _real_extract(self, url):
3367 htmlParser = HTMLParser.HTMLParser()
3368
3369 mobj = re.match(self._VALID_URL, url)
3370 if mobj is None:
3371 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3372 return
3373 video_id = mobj.group(1).decode('utf-8')
3374
3375 self.report_webpage(video_id)
3376
a1a8713a 3377 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
6501a06d
RB
3378 try:
3379 webpage = urllib2.urlopen(request).read()
3380 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3381 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3382 return
3383
3384 self.report_extraction(video_id)
3385
3386
3387 # Extract video URL
3388 mobj = re.search(r'flv_url=(.+?)&', webpage)
3389 if mobj is None:
9f47175a 3390 self._downloader.trouble(u'ERROR: unable to extract video url')
6501a06d
RB
3391 return
3392 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3393
3394
3395 # Extract title
0f9b7722 3396 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
6501a06d
RB
3397 if mobj is None:
3398 self._downloader.trouble(u'ERROR: unable to extract video title')
3399 return
3400 video_title = mobj.group(1).decode('utf-8')
3401
3402
3403 # Extract video thumbnail
3404 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3405 if mobj is None:
3406 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3407 return
3408 video_thumbnail = mobj.group(1).decode('utf-8')
3409
3410
3411
3412 self._downloader.increment_downloads()
3413 info = {
3414 'id': video_id,
3415 'url': video_url,
3416 'uploader': None,
3417 'upload_date': None,
3418 'title': video_title,
e092418d 3419 'stitle': _simplify_title(video_title),
6501a06d
RB
3420 'ext': 'flv',
3421 'format': 'flv',
3422 'thumbnail': video_thumbnail,
3423 'description': None,
3424 'player_url': None,
3425 }
3426
3427 try:
3428 self._downloader.process_info(info)
3429 except UnavailableVideoError, err:
3430 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3431
3432
b20d4f86 3433class SoundcloudIE(InfoExtractor):
073d7a59 3434 """Information extractor for soundcloud.com
b20d4f86
KN
3435 To access the media, the uid of the song and a stream token
3436 must be extracted from the page source and the script must make
3437 a request to media.soundcloud.com/crossdomain.xml. Then
3438 the media can be grabbed by requesting from a URL composed
3439 of the stream token and uid
3440 """
ecb3bfe5 3441
40306424 3442 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
ecb3bfe5
KN
3443 IE_NAME = u'soundcloud'
3444
b20d4f86
KN
3445 def __init__(self, downloader=None):
3446 InfoExtractor.__init__(self, downloader)
40306424
KN
3447
3448 def report_webpage(self, video_id):
3449 """Report information extraction."""
3450 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3451
3452 def report_extraction(self, video_id):
3453 """Report information extraction."""
3454 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3455
40306424
KN
3456 def _real_extract(self, url):
3457 htmlParser = HTMLParser.HTMLParser()
3458
3459 mobj = re.match(self._VALID_URL, url)
3460 if mobj is None:
3461 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3462 return
3463
b20d4f86
KN
3464 # extract uploader (which is in the url)
3465 uploader = mobj.group(1).decode('utf-8')
3466 # extract simple title (uploader + slug of song title)
3467 slug_title = mobj.group(2).decode('utf-8')
40306424
KN
3468 simple_title = uploader + '-' + slug_title
3469
3470 self.report_webpage('%s/%s' % (uploader, slug_title))
3471
3472 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3473 try:
3474 webpage = urllib2.urlopen(request).read()
3475 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3476 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3477 return
3478
3479 self.report_extraction('%s/%s' % (uploader, slug_title))
3480
ec574c2c 3481 # extract uid and stream token that soundcloud hands out for access
5b3330e0 3482 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
b20d4f86 3483 if mobj:
871be928
KN
3484 video_id = mobj.group(1)
3485 stream_token = mobj.group(2)
else:
self._downloader.trouble(u'ERROR: unable to extract uid and stream token')
return
b20d4f86 3486
ec574c2c
KN
3487 # extract unsimplified title
3488 mobj = re.search('"title":"(.*?)",', webpage)
3489 if mobj:
3490 title = mobj.group(1)
3491
3492 # construct media url (with uid/token)
b20d4f86
KN
3493 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3494 mediaURL = mediaURL % (video_id, stream_token)
3495
3496 # description
3497 description = u'No description available'
871be928 3498 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
b20d4f86
KN
3499 if mobj:
3500 description = mobj.group(1)
3501
3502 # upload date
871be928
KN
3503 upload_date = None
3504 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
b20d4f86
KN
3505 if mobj:
3506 try:
871be928
KN
3507 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3508 except Exception, err:
3509 self._downloader.to_stderr(u'WARNING: unable to parse upload date: %s' % str(err))
b20d4f86 3510
ec574c2c 3511 # for soundcloud, a request to a cross domain is required for cookies
b20d4f86
KN
3512 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', None, std_headers)
3513
3514 try:
3515 self._downloader.process_info({
ec574c2c 3516 'id': video_id.decode('utf-8'),
871be928 3517 'url': mediaURL,
ec574c2c 3518 'uploader': uploader.decode('utf-8'),
073d7a59 3519 'upload_date': upload_date,
ec574c2c
KN
3520 'title': simple_title.decode('utf-8'),
3521 'stitle': simple_title.decode('utf-8'),
40306424
KN
3522 'ext': u'mp3',
3523 'format': u'NA',
3524 'player_url': None,
ec574c2c 3525 'description': description.decode('utf-8')
b20d4f86
KN
3526 })
3527 except UnavailableVideoError:
3528 self._downloader.trouble(u'\nERROR: unable to download video')
ecb3bfe5 3529
208c4b91 3530
3b98a5dd
OA
3531class InfoQIE(InfoExtractor):
3532 """Information extractor for infoq.com"""
3533
3534 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3535 IE_NAME = u'infoq'
3536
3537 def report_webpage(self, video_id):
3538 """Report information extraction."""
3539 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3540
3541 def report_extraction(self, video_id):
3542 """Report information extraction."""
3543 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3544
3b98a5dd
OA
3545 def _real_extract(self, url):
3546 htmlParser = HTMLParser.HTMLParser()
3547
3548 mobj = re.match(self._VALID_URL, url)
3549 if mobj is None:
3550 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3551 return
3552
3553 self.report_webpage(url)
3554
3555 request = urllib2.Request(url)
3556 try:
3557 webpage = urllib2.urlopen(request).read()
3558 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3559 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3560 return
3561
3562 self.report_extraction(url)
3563
3564
3565 # Extract video URL
3566 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3567 if mobj is None:
3568 self._downloader.trouble(u'ERROR: unable to extract video url')
3569 return
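# The jsclassref attribute holds the URL-quoted, base64-encoded path of the
# stream; decoding it and prefixing the RTMPE base yields the video URL.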
3570 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3571
3572
3573 # Extract title
3574 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3575 if mobj is None:
3576 self._downloader.trouble(u'ERROR: unable to extract video title')
3577 return
3578 video_title = mobj.group(1).decode('utf-8')
3579
3b98a5dd
OA
3580 # Extract description
3581 video_description = u'No description available.'
3582 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3583 if mobj is not None:
3584 video_description = mobj.group(1).decode('utf-8')
3585
3586 video_filename = video_url.split('/')[-1]
3587 video_id, extension = video_filename.split('.')
3588
3589 self._downloader.increment_downloads()
3590 info = {
3591 'id': video_id,
3592 'url': video_url,
3593 'uploader': None,
3594 'upload_date': None,
3595 'title': video_title,
e092418d 3596 'stitle': _simplify_title(video_title),
3b98a5dd
OA
3597 'ext': extension,
3598 'format': extension, # Extension is always(?) mp4, but seems to be flv
3599 'thumbnail': None,
3600 'description': video_description,
3601 'player_url': None,
3602 }
3603
3604 try:
3605 self._downloader.process_info(info)
3606 except UnavailableVideoError, err:
3607 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3608
3609
00f95a93 3610
65cd34c5
RG
3611class PostProcessor(object):
3612 """Post Processor class.
3613
3614 PostProcessor objects can be added to downloaders with their
3615 add_post_processor() method. When the downloader has finished a
3616 successful download, it will take its internal chain of PostProcessors
3617 and start calling the run() method on each one of them, first with
3618 an initial argument and then with the returned value of the previous
3619 PostProcessor.
3620
3621 The chain will be stopped if one of them ever returns None or the end
3622 of the chain is reached.
3623
3624 PostProcessor objects follow a "mutual registration" process similar
3625 to InfoExtractor objects.
3626 """
3627
3628 _downloader = None
3629
3630 def __init__(self, downloader=None):
3631 self._downloader = downloader
3632
65cd34c5
RG
3633 def set_downloader(self, downloader):
3634 """Sets the downloader for this PP."""
3635 self._downloader = downloader
d3975459 3636
65cd34c5
RG
3637 def run(self, information):
3638 """Run the PostProcessor.
3639
3640 The "information" argument is a dictionary like the ones
2f11508a 3641 composed by InfoExtractors. The only difference is that this
65cd34c5
RG
3642 one has an extra field called "filepath" that points to the
3643 downloaded file.
3644
3645 When this method returns None, the postprocessing chain is
3646 stopped. However, this method may return an information
3647 dictionary that will be passed to the next postprocessing
3648 object in the chain. It can be the one it received after
3649 changing some fields.
3650
3651 In addition, this method may raise a PostProcessingError
3652 exception that will be taken into account by the downloader
3653 it was called from.
3654 """
3655 return information # by default, do nothing
d3975459 3656
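# Illustrative sketch only (not part of the original script): a custom post
# processor is registered via FileDownloader.add_post_processor() and, like
# FFmpegExtractAudioPP below, receives the info dict of each finished download.
# This one merely reports the final path and passes the dict on unchanged.
class ExamplePrintPathPP(PostProcessor):
	def run(self, information):
		self._downloader.to_screen(u'[example-pp] downloaded to %s' % information['filepath'])
		return information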
c0a10ca8 3657
3072fab1
RG
3658class FFmpegExtractAudioPP(PostProcessor):
3659
c99dcbd2 3660 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
3072fab1
RG
3661 PostProcessor.__init__(self, downloader)
3662 if preferredcodec is None:
3663 preferredcodec = 'best'
3664 self._preferredcodec = preferredcodec
18b7f874 3665 self._preferredquality = preferredquality
3666 self._keepvideo = keepvideo
3072fab1
RG
3667
3668 @staticmethod
3669 def get_audio_codec(path):
da273188 3670 try:
2727dbf7
RG
3671 cmd = ['ffprobe', '-show_streams', '--', path]
3672 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
da273188
RG
3673 output = handle.communicate()[0]
3674 if handle.wait() != 0:
3675 return None
3676 except (IOError, OSError):
3072fab1
RG
3677 return None
3678 audio_codec = None
3679 for line in output.split('\n'):
3680 if line.startswith('codec_name='):
3681 audio_codec = line.split('=')[1].strip()
3682 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
3683 return audio_codec
3684 return None
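# For reference, 'ffprobe -show_streams' prints key=value lines per stream,
# e.g. (abridged):
#   codec_name=aac
#   codec_type=audio
# so the loop above returns the codec_name seen just before a codec_type=audio line.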
3685
3686 @staticmethod
3687 def run_ffmpeg(path, out_path, codec, more_opts):
3688 try:
2727dbf7
RG
3689 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3690 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3072fab1
RG
3691 return (ret == 0)
3692 except (IOError, OSError):
3693 return False
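# For example, extracting MP3 audio at the default quality of 128K runs a
# command equivalent to (illustrative file names):
#   ffmpeg -y -i input.flv -vn -acodec libmp3lame -ab 128K -- input.mp3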
3694
3695 def run(self, information):
3696 path = information['filepath']
3697
3698 filecodec = self.get_audio_codec(path)
3699 if filecodec is None:
da273188 3700 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
3072fab1
RG
3701 return None
3702
3703 more_opts = []
3704 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
efb113c7 3705 if filecodec in ['aac', 'mp3', 'vorbis']:
3072fab1
RG
3706 # Lossless if possible
3707 acodec = 'copy'
3708 extension = filecodec
3709 if filecodec == 'aac':
3710 more_opts = ['-f', 'adts']
58384838
RC
3711 if filecodec == 'vorbis':
3712 extension = 'ogg'
3072fab1
RG
3713 else:
3714 # MP3 otherwise.
3715 acodec = 'libmp3lame'
3716 extension = 'mp3'
c99dcbd2
PH
3717 more_opts = []
3718 if self._preferredquality is not None:
3719 more_opts += ['-ab', self._preferredquality]
3072fab1
RG
3720 else:
3721 # We convert the audio (lossy)
58384838 3722 acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
3072fab1 3723 extension = self._preferredcodec
c99dcbd2
PH
3724 more_opts = []
3725 if self._preferredquality is not None:
3726 more_opts += ['-ab', self._preferredquality]
3072fab1
RG
3727 if self._preferredcodec == 'aac':
3728 more_opts += ['-f', 'adts']
58384838
RC
3729 if self._preferredcodec == 'vorbis':
3730 extension = 'ogg'
3072fab1
RG
3731
3732 (prefix, ext) = os.path.splitext(path)
3733 new_path = prefix + '.' + extension
3734 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3735 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3736
3737 if not status:
1bd92582 3738 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
3072fab1
RG
3739 return None
3740
36597dc4
K
3741 # Try to update the date time for extracted audio file.
3742 if information.get('filetime') is not None:
3743 try:
3744 os.utime(new_path, (time.time(), information['filetime']))
3745 except:
3746 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
3747
18b7f874 3748 if not self._keepvideo:
3749 try:
3750 os.remove(path)
3751 except (IOError, OSError):
3752 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
3753 return None
3072fab1
RG
3754
3755 information['filepath'] = new_path
3756 return information
3757
5fb3df4a
GV
3758
3759def updateSelf(downloader, filename):
3760 ''' Update the program file with the latest version from the repository '''
3761 # Note: downloader only used for options
3762 if not os.access(filename, os.W_OK):
3763 sys.exit('ERROR: no write permissions on %s' % filename)
3764
d207e7cf 3765 downloader.to_screen('Updating to latest version...')
5fb3df4a 3766
4fa74b52 3767 try:
d207e7cf
PH
3768 try:
3769 urlh = urllib.urlopen(UPDATE_URL)
3770 newcontent = urlh.read()
27365956
PH
3771
3772 vmatch = re.search("__version__ = '([^']+)'", newcontent)
3773 if vmatch is not None and vmatch.group(1) == __version__:
3774 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
3775 return
d207e7cf
PH
3776 finally:
3777 urlh.close()
5fb3df4a
GV
3778 except (IOError, OSError), err:
3779 sys.exit('ERROR: unable to download latest version')
f9f1e798 3780
5fb3df4a 3781 try:
d207e7cf
PH
3782 outf = open(filename, 'wb')
3783 try:
3784 outf.write(newcontent)
3785 finally:
3786 outf.close()
5fb3df4a
GV
3787 except (IOError, OSError), err:
3788 sys.exit('ERROR: unable to overwrite current version')
4bec29ef 3789
eb6c37da 3790 downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
80066952 3791
4f9f96f6
GV
3792def parseOpts():
3793 # Deferred imports
3794 import getpass
3795 import optparse
e7cf18cb 3796
4f9f96f6
GV
3797 def _format_option_string(option):
3798 ''' ('-o', '--option') -> -o, --option METAVAR'''
80066952 3799
4f9f96f6
GV
3800 opts = []
3801
3802 if option._short_opts: opts.append(option._short_opts[0])
3803 if option._long_opts: opts.append(option._long_opts[0])
3804 if len(opts) > 1: opts.insert(1, ', ')
3805
3806 if option.takes_value(): opts.append(' %s' % option.metavar)
3807
3808 return "".join(opts)
3809
6a4f0a11
GV
3810 def _find_term_columns():
3811 columns = os.environ.get('COLUMNS', None)
2c8d32de
PH
3812 if columns:
3813 return int(columns)
3814
4f2a5e06
PH
3815 try:
3816 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3817 out,err = sp.communicate()
eb0387a8 3818 return int(out.split()[1])
4f2a5e06
PH
3819 except:
3820 pass
2c8d32de 3821 return None
6a4f0a11 3822
51c8e53f
GV
3823 max_width = 80
3824 max_help_position = 80
3825
3826 # No need to wrap help messages if we're on a wide console
6a4f0a11 3827 columns = _find_term_columns()
51c8e53f
GV
3828 if columns: max_width = columns
3829
3830 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4f9f96f6
GV
3831 fmt.format_option_strings = _format_option_string
3832
3833 kw = {
3834 'version' : __version__,
3835 'formatter' : fmt,
a2f7e3a5 3836 'usage' : '%prog [options] url [url...]',
4f9f96f6
GV
3837 'conflict_handler' : 'resolve',
3838 }
3839
3840 parser = optparse.OptionParser(**kw)
3841
3842 # option groups
3843 general = optparse.OptionGroup(parser, 'General Options')
20e91e83 3844 selection = optparse.OptionGroup(parser, 'Video Selection')
4f9f96f6
GV
3845 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3846 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3847 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3848 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3849 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3850
3851 general.add_option('-h', '--help',
3852 action='help', help='print this help text and exit')
3853 general.add_option('-v', '--version',
3854 action='version', help='print program version and exit')
3855 general.add_option('-U', '--update',
e0e56865 3856 action='store_true', dest='update_self', help='update this program to latest version')
4f9f96f6
GV
3857 general.add_option('-i', '--ignore-errors',
3858 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3859 general.add_option('-r', '--rate-limit',
3860 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3861 general.add_option('-R', '--retries',
3862 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4f9f96f6
GV
3863 general.add_option('--dump-user-agent',
3864 action='store_true', dest='dump_user_agent',
3865 help='display the current browser identification', default=False)
f3098c4d
PH
3866 general.add_option('--list-extractors',
3867 action='store_true', dest='list_extractors',
3868 help='List all supported extractors and the URLs they would handle', default=False)
4f9f96f6 3869
20e91e83
ABP
3870 selection.add_option('--playlist-start',
3871 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3872 selection.add_option('--playlist-end',
3873 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3874 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3875 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
3876
4f9f96f6
GV
3877 authentication.add_option('-u', '--username',
3878 dest='username', metavar='USERNAME', help='account username')
3879 authentication.add_option('-p', '--password',
3880 dest='password', metavar='PASSWORD', help='account password')
3881 authentication.add_option('-n', '--netrc',
3882 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3883
3884
3885 video_format.add_option('-f', '--format',
3886 action='store', dest='format', metavar='FORMAT', help='video format code')
3887 video_format.add_option('--all-formats',
5260e68f 3888 action='store_const', dest='format', help='download all available video formats', const='all')
4f9f96f6
GV
3889 video_format.add_option('--max-quality',
3890 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2761012f
PH
3891 video_format.add_option('-F', '--list-formats',
3892 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4f9f96f6
GV
3893
3894
3895 verbosity.add_option('-q', '--quiet',
3896 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3897 verbosity.add_option('-s', '--simulate',
9b4556c4
PH
3898 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
3899 verbosity.add_option('--skip-download',
3900 action='store_true', dest='skip_download', help='do not download the video', default=False)
4f9f96f6
GV
3901 verbosity.add_option('-g', '--get-url',
3902 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3903 verbosity.add_option('-e', '--get-title',
3904 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3905 verbosity.add_option('--get-thumbnail',
3906 action='store_true', dest='getthumbnail',
3907 help='simulate, quiet but print thumbnail URL', default=False)
3908 verbosity.add_option('--get-description',
3909 action='store_true', dest='getdescription',
3910 help='simulate, quiet but print video description', default=False)
3911 verbosity.add_option('--get-filename',
3912 action='store_true', dest='getfilename',
3913 help='simulate, quiet but print output filename', default=False)
da0db53a
DH
3914 verbosity.add_option('--get-format',
3915 action='store_true', dest='getformat',
3916 help='simulate, quiet but print output format', default=False)
4f9f96f6
GV
3917 verbosity.add_option('--no-progress',
3918 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3919 verbosity.add_option('--console-title',
3920 action='store_true', dest='consoletitle',
3921 help='display progress in console titlebar', default=False)
3922
3923
3924 filesystem.add_option('-t', '--title',
3925 action='store_true', dest='usetitle', help='use title in file name', default=False)
3926 filesystem.add_option('-l', '--literal',
3927 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3928 filesystem.add_option('-A', '--auto-number',
3929 action='store_true', dest='autonumber',
3930 help='number downloaded files starting from 00000', default=False)
3931 filesystem.add_option('-o', '--output',
6bde5972 3932 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, and %% for a literal percent')
4f9f96f6
GV
3933 filesystem.add_option('-a', '--batch-file',
3934 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3935 filesystem.add_option('-w', '--no-overwrites',
3936 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3937 filesystem.add_option('-c', '--continue',
c25303c3 3938 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
18bb3d1e
PH
3939 filesystem.add_option('--no-continue',
3940 action='store_false', dest='continue_dl',
3941 help='do not resume partially downloaded files (restart from beginning)')
4f9f96f6 3942 filesystem.add_option('--cookies',
abb870d1 3943 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4f9f96f6
GV
3944 filesystem.add_option('--no-part',
3945 action='store_true', dest='nopart', help='do not use .part files', default=False)
3946 filesystem.add_option('--no-mtime',
3947 action='store_false', dest='updatetime',
3948 help='do not use the Last-modified header to set the file modification time', default=True)
2c8d32de
PH
3949 filesystem.add_option('--write-description',
3950 action='store_true', dest='writedescription',
3951 help='write video description to a .description file', default=False)
3952 filesystem.add_option('--write-info-json',
3953 action='store_true', dest='writeinfojson',
3954 help='write video metadata to a .info.json file', default=False)
4f9f96f6
GV
3955
3956
3957 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3958 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3959 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
58384838 3960 help='"best", "aac", "vorbis" or "mp3"; best by default')
c99dcbd2
PH
3961 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
3962 help='ffmpeg audio bitrate specification, 128k by default')
3963 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
3964 help='keeps the video file on disk after the post-processing; the video is erased by default')
4f9f96f6
GV
3965
3966
3967 parser.add_option_group(general)
20e91e83 3968 parser.add_option_group(selection)
4f9f96f6
GV
3969 parser.add_option_group(filesystem)
3970 parser.add_option_group(verbosity)
3971 parser.add_option_group(video_format)
3972 parser.add_option_group(authentication)
3973 parser.add_option_group(postproc)
3974
3975 opts, args = parser.parse_args()
3976
3977 return parser, opts, args
3978
f3098c4d
PH
3979def gen_extractors():
3980 """ Return a list of an instance of every supported extractor.
3981 The order does matter; the first extractor matched is the one handling the URL.
3982 """
3983 youtube_ie = YoutubeIE()
3984 google_ie = GoogleIE()
3985 yahoo_ie = YahooIE()
3986 return [
f3098c4d
PH
3987 YoutubePlaylistIE(youtube_ie),
3988 YoutubeUserIE(youtube_ie),
3989 YoutubeSearchIE(youtube_ie),
1cde6f1d
PH
3990 youtube_ie,
3991 MetacafeIE(youtube_ie),
3992 DailymotionIE(),
f3098c4d
PH
3993 google_ie,
3994 GoogleSearchIE(google_ie),
3995 PhotobucketIE(),
3996 yahoo_ie,
3997 YahooSearchIE(yahoo_ie),
3998 DepositFilesIE(),
3999 FacebookIE(),
4000 BlipTVIE(),
4001 VimeoIE(),
4002 MyVideoIE(),
4003 ComedyCentralIE(),
4004 EscapistIE(),
8d89fbae 4005 CollegeHumorIE(),
6501a06d 4006 XVideosIE(),
38348005 4007 SoundcloudIE(),
3b98a5dd 4008 InfoQIE(),
f3098c4d
PH
4009
4010 GenericIE()
4011 ]
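# GenericIE is listed last so that the specialized extractors above get the
# first chance to match; the order of this list decides which extractor
# handles a given URL.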
4012
235b3ba4 4013def _real_main():
5adcaa43 4014 parser, opts, args = parseOpts()
4f9f96f6 4015
5adcaa43
GV
4016 # Open appropriate CookieJar
4017 if opts.cookiefile is None:
4018 jar = cookielib.CookieJar()
4019 else:
8cc44341 4020 try:
5adcaa43
GV
4021 jar = cookielib.MozillaCookieJar(opts.cookiefile)
4022 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4023 jar.load()
4024 except (IOError, OSError), err:
4025 sys.exit(u'ERROR: unable to open cookie file')
80066952 4026
5adcaa43
GV
4027 # Dump user agent
4028 if opts.dump_user_agent:
4029 print std_headers['User-Agent']
4030 sys.exit(0)
e7cf18cb 4031
5adcaa43
GV
4032 # Batch file verification
4033 batchurls = []
4034 if opts.batchfile is not None:
8cc44341 4035 try:
5adcaa43
GV
4036 if opts.batchfile == '-':
4037 batchfd = sys.stdin
4bec29ef 4038 else:
5adcaa43
GV
4039 batchfd = open(opts.batchfile, 'r')
4040 batchurls = batchfd.readlines()
4041 batchurls = [x.strip() for x in batchurls]
4042 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4043 except IOError:
4044 sys.exit(u'ERROR: batch file could not be read')
4045 all_urls = batchurls + args
4046
f3098c4d
PH
4047 # General configuration
4048 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4049 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4050 urllib2.install_opener(opener)
4051 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4052
4053 extractors = gen_extractors()
4054
4055 if opts.list_extractors:
4056 for ie in extractors:
4057 print(ie.IE_NAME)
4058 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4059 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4060 for mu in matchedUrls:
4061 print(u' ' + mu)
4062 sys.exit(0)
4063
5adcaa43
GV
4064 # Conflicting, missing and erroneous options
4065 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4066 parser.error(u'using .netrc conflicts with giving username/password')
4067 if opts.password is not None and opts.username is None:
4068 parser.error(u'account username missing')
4069 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4070 parser.error(u'using output template conflicts with using title, literal title or auto number')
4071 if opts.usetitle and opts.useliteral:
4072 parser.error(u'using title conflicts with using literal title')
4073 if opts.username is not None and opts.password is None:
4074 opts.password = getpass.getpass(u'Type account password and press return:')
4075 if opts.ratelimit is not None:
4076 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4077 if numeric_limit is None:
4078 parser.error(u'invalid rate limit specified')
4079 opts.ratelimit = numeric_limit
4080 if opts.retries is not None:
8cc44341 4081 try:
5adcaa43 4082 opts.retries = long(opts.retries)
8cc44341 4083 except (TypeError, ValueError), err:
5adcaa43
GV
4084 parser.error(u'invalid retry count specified')
4085 try:
2c8d32de 4086 opts.playliststart = int(opts.playliststart)
5adcaa43 4087 if opts.playliststart <= 0:
2c8d32de 4088 raise ValueError(u'Playlist start must be positive')
5adcaa43
GV
4089 except (TypeError, ValueError), err:
4090 parser.error(u'invalid playlist start number specified')
4091 try:
2c8d32de 4092 opts.playlistend = int(opts.playlistend)
5adcaa43 4093 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2c8d32de 4094 raise ValueError(u'Playlist end must be greater than playlist start')
5adcaa43
GV
4095 except (TypeError, ValueError), err:
4096 parser.error(u'invalid playlist end number specified')
4097 if opts.extractaudio:
58384838 4098 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
5adcaa43
GV
4099 parser.error(u'invalid audio format specified')
4100
5adcaa43
GV
4101 # File downloader
4102 fd = FileDownloader({
4103 'usenetrc': opts.usenetrc,
4104 'username': opts.username,
4105 'password': opts.password,
da0db53a 4106 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
5adcaa43
GV
4107 'forceurl': opts.geturl,
4108 'forcetitle': opts.gettitle,
4109 'forcethumbnail': opts.getthumbnail,
4110 'forcedescription': opts.getdescription,
4111 'forcefilename': opts.getfilename,
da0db53a 4112 'forceformat': opts.getformat,
9b4556c4 4113 'simulate': opts.simulate,
da0db53a 4114 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
5adcaa43
GV
4115 'format': opts.format,
4116 'format_limit': opts.format_limit,
3de2a1e6 4117 'listformats': opts.listformats,
5adcaa43
GV
4118 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4119 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4120 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4121 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4122 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4123 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4124 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4125 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4126 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4127 or u'%(id)s.%(ext)s'),
4128 'ignoreerrors': opts.ignoreerrors,
4129 'ratelimit': opts.ratelimit,
4130 'nooverwrites': opts.nooverwrites,
4131 'retries': opts.retries,
4132 'continuedl': opts.continue_dl,
4133 'noprogress': opts.noprogress,
4134 'playliststart': opts.playliststart,
4135 'playlistend': opts.playlistend,
4136 'logtostderr': opts.outtmpl == '-',
4137 'consoletitle': opts.consoletitle,
4138 'nopart': opts.nopart,
4139 'updatetime': opts.updatetime,
2c8d32de
PH
4140 'writedescription': opts.writedescription,
4141 'writeinfojson': opts.writeinfojson,
20e91e83
ABP
4142 'matchtitle': opts.matchtitle,
4143 'rejecttitle': opts.rejecttitle,
5adcaa43 4144 })
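# Illustrative outtmpl expansion (hypothetical video): with -t, the template
# u'%(stitle)s-%(id)s.%(ext)s' might produce something like
# Some_Video_Title-abc123.mp4, where stitle is the simplified title.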
8c5dc3ad
PH
4145 for extractor in extractors:
4146 fd.add_info_extractor(extractor)
5adcaa43
GV
4147
4148 # PostProcessors
4149 if opts.extractaudio:
c99dcbd2 4150 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
5adcaa43
GV
4151
4152 # Update version
4153 if opts.update_self:
4154 updateSelf(fd, sys.argv[0])
4155
4156 # Maybe do nothing
4157 if len(all_urls) < 1:
4158 if not opts.update_self:
4159 parser.error(u'you must provide at least one URL')
4160 else:
4161 sys.exit()
4162 retcode = fd.download(all_urls)
80066952 4163
5adcaa43
GV
4164 # Dump cookie jar if requested
4165 if opts.cookiefile is not None:
4166 try:
4167 jar.save()
4168 except (IOError, OSError), err:
4169 sys.exit(u'ERROR: unable to save cookie jar')
80066952 4170
5adcaa43 4171 sys.exit(retcode)
80066952 4172
235b3ba4 4173def main():
5adcaa43 4174 try:
235b3ba4 4175 _real_main()
e5bf0f55
RG
4176 except DownloadError:
4177 sys.exit(1)
4178 except SameFileError:
76a7f364 4179 sys.exit(u'ERROR: fixed output name but more than one file to download')
4fa74b52 4180 except KeyboardInterrupt:
76a7f364 4181 sys.exit(u'\nERROR: Interrupted by user')
e9cb9c28 4182
235b3ba4
PH
4183if __name__ == '__main__':
4184 main()
4185
e9cb9c28 4186# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: