]> jfr.im git - yt-dlp.git/blob - youtube-dl
Robust error handling in downloading code
[yt-dlp.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __author__ = (
5 'Ricardo Garcia Gonzalez',
6 'Danny Colligan',
7 'Benjamin Johnson',
8 'Vasyl\' Vavrychuk',
9 'Witold Baryluk',
10 'Paweł Paprota',
11 'Gergely Imreh',
12 'Rogério Brito',
13 'Philipp Hagemeister',
14 )
15
16 __license__ = 'Public Domain'
17 __version__ = '2011.08.28-phihag'
18
19 UPDATE_URL = 'https://raw.github.com/phihag/youtube-dl/master/youtube-dl'
20
21 import cookielib
22 import datetime
23 import gzip
24 import htmlentitydefs
25 import httplib
26 import locale
27 import math
28 import netrc
29 import os
30 import os.path
31 import re
32 import socket
33 import string
34 import subprocess
35 import sys
36 import time
37 import urllib
38 import urllib2
39 import warnings
40 import zlib
41
42 if os.name == 'nt':
43 import ctypes
44
45 try:
46 import email.utils
47 except ImportError: # Python 2.4
48 import email.Utils
49 try:
50 import cStringIO as StringIO
51 except ImportError:
52 import StringIO
53
54 # parse_qs was moved from the cgi module to the urlparse module recently.
55 try:
56 from urlparse import parse_qs
57 except ImportError:
58 from cgi import parse_qs
59
60 try:
61 import lxml.etree
62 except ImportError:
63 pass # Handled below
64
# Default HTTP headers sent with every request. The User-Agent mimics a
# desktop Firefox so servers send the same content they would to a browser.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode string of characters considered safe for simplified titles.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
74
try:
    import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    import re
    class json(object):
        """Minimal stand-in for the stdlib json module (decoder only).

        Only loads() is provided; there is no dump()/dumps(). Callers that
        need an encoder probe for json.dump explicitly (see process_info).
        """
        @staticmethod
        def loads(s):
            # Recursive-descent JSON parser over the decoded text. Each
            # parse* helper takes an index i and returns (next_index, value).
            s = s.decode('UTF-8')
            def raiseError(msg, i):
                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
            def skipSpace(i, expectMore=True):
                # Advance past whitespace; optionally fail on premature end.
                while i < len(s) and s[i] in ' \t\r\n':
                    i += 1
                if expectMore:
                    if i >= len(s):
                        raiseError('Premature end', i)
                return i
            def decodeEscape(match):
                # Translate one backslash escape, including \uXXXX and
                # UTF-16 surrogate pairs (\uD8xx\uDCxx -> one code point).
                esc = match.group(1)
                _STATIC = {
                    '"': '"',
                    '\\': '\\',
                    '/': '/',
                    'b': unichr(0x8),
                    'f': unichr(0xc),
                    'n': '\n',
                    'r': '\r',
                    't': '\t',
                }
                if esc in _STATIC:
                    return _STATIC[esc]
                if esc[0] == 'u':
                    if len(esc) == 1+4:
                        return unichr(int(esc[1:5], 16))
                    if len(esc) == 5+6 and esc[5:7] == '\\u':
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                raise ValueError('Unknown escape ' + str(esc))
            def parseString(i):
                # i points at the opening quote; scan to the closing quote,
                # counting preceding backslashes to skip escaped quotes.
                i += 1
                e = i
                while True:
                    e = s.index('"', e)
                    bslashes = 0
                    while s[e-bslashes-1] == '\\':
                        bslashes += 1
                    if bslashes % 2 == 1:
                        e += 1
                        continue
                    break
                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                stri = rexp.sub(decodeEscape, s[i:e])
                return (e+1,stri)
            def parseObj(i):
                # i points at '{'.
                i += 1
                res = {}
                i = skipSpace(i)
                if s[i] == '}': # Empty dictionary
                    return (i+1,res)
                while True:
                    if s[i] != '"':
                        raiseError('Expected a string object key', i)
                    i,key = parseString(i)
                    i = skipSpace(i)
                    if i >= len(s) or s[i] != ':':
                        raiseError('Expected a colon', i)
                    i,val = parse(i+1)
                    res[key] = val
                    i = skipSpace(i)
                    if s[i] == '}':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected comma or closing curly brace', i)
                    i = skipSpace(i+1)
            def parseArray(i):
                # i points at '['.
                res = []
                i = skipSpace(i+1)
                if s[i] == ']': # Empty array
                    return (i+1,res)
                while True:
                    i,val = parse(i)
                    res.append(val)
                    i = skipSpace(i) # Raise exception if premature end
                    if s[i] == ']':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected a comma or closing bracket', i)
                    i = skipSpace(i+1)
            def parseDiscrete(i):
                # The literals true / false / null.
                for k,v in {'true': True, 'false': False, 'null': None}.items():
                    if s.startswith(k, i):
                        return (i+len(k), v)
                raiseError('Not a boolean (or null)', i)
            def parseNumber(i):
                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                if mobj is None:
                    raiseError('Not a number', i)
                nums = mobj.group(1)
                if '.' in nums or 'e' in nums or 'E' in nums:
                    return (i+len(nums), float(nums))
                return (i+len(nums), int(nums))
            CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
            def parse(i):
                # Dispatch on the first non-space character; anything not in
                # CHARMAP is assumed to start a number.
                i = skipSpace(i)
                i,res = CHARMAP.get(s[i], parseNumber)(i)
                i = skipSpace(i, False)
                return (i,res)
            i,res = parse(0)
            if i < len(s):
                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
            return res
187
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # The original wrapped this in a one-shot generator and called
    # .next() on it, which added nothing; a plain try/return suffices.
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding is actually usable.
        u'TEST'.encode(pref)
    except:
        # Misconfigured locale: fall back to UTF-8.
        pref = 'UTF-8'
    return pref
203
204 def htmlentity_transform(matchobj):
205 """Transforms an HTML entity to a Unicode character.
206
207 This function receives a match object and is intended to be used with
208 the re.sub() function.
209 """
210 entity = matchobj.group(1)
211
212 # Known non-numeric HTML entity
213 if entity in htmlentitydefs.name2codepoint:
214 return unichr(htmlentitydefs.name2codepoint[entity])
215
216 # Unicode character
217 mobj = re.match(ur'(?u)#(x?\d+)', entity)
218 if mobj is not None:
219 numstr = mobj.group(1)
220 if numstr.startswith(u'x'):
221 base = 16
222 numstr = u'0%s' % numstr
223 else:
224 base = 10
225 return unichr(long(numstr, base))
226
227 # Unknown entity in name, return its literal representation
228 return (u'&%s;' % entity)
229
230 def sanitize_title(utitle):
231 """Sanitizes a video title so it could be used as part of a filename."""
232 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
233 return utitle.replace(unicode(os.sep), u'%')
234
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            # '-' means standard output.
            if sys.platform == 'win32':
                import msvcrt
                # Put stdout into binary mode so video bytes are not mangled.
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(filename, open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
260
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Not a valid RFC 2822 date string.
        return None
    return email.utils.mktime_tz(parsed)
268
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.

    Raised from FileDownloader.trouble() when 'ignoreerrors' is not set.
    """
    pass
277
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.

    Raised from FileDownloader.download() when several URLs are given but
    the output template contains no %(...)s placeholders.
    """
    pass
285
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
293
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
301
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None   # bytes actually received
    expected = None     # bytes announced by the server (Content-Length)

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
316
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Some servers send raw deflate data without the zlib header;
        # try the raw form first, then fall back to the wrapped form.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Newer urllib2.addinfourl accepts the status code directly
        # (detected via the getcode attribute); older versions need the
        # code attribute set by hand.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force the standard headers, replacing any caller-supplied ones.
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        if 'Youtubedl-no-compression' in req.headers:
            # Marker header: drop Accept-encoding and the marker itself
            # before the request goes on the wire.
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
374
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:         Username for authentication purposes.
    password:         Password for authentication purposes.
    usenetrc:         Use netrc for authentication instead.
    quiet:            Do not print messages to stdout.
    forceurl:         Force printing final URL.
    forcetitle:       Force printing title.
    forcethumbnail:   Force printing thumbnail URL.
    forcedescription: Force printing description.
    forcefilename:    Force printing final filename.
    simulate:         Do not download the video files.
    format:           Video format code.
    format_limit:     Highest quality format to try.
    outtmpl:          Template for output names.
    ignoreerrors:     Do not stop on download errors.
    ratelimit:        Download speed limit, in bytes/sec.
    nooverwrites:     Prevent overwriting files.
    retries:          Number of times to retry for HTTP error 5xx
    continuedl:       Try to continue downloads if possible.
    noprogress:       Do not print the progress bar.
    playliststart:    Playlist item to start at.
    playlistend:      Playlist item to end at.
    logtostderr:      Log messages to stderr instead of stdout.
    consoletitle:     Display progress in console window's titlebar.
    nopart:           Do not use temporary .part files.
    updatetime:       Use the Last-modified header to set output file timestamps.
    writedescription: Write the video description to a .description file
    writeinfojson:    Write the video description to a .info.json file
    """

    # Option dictionary; see the docstring above for the supported keys.
    params = None
    # Registered InfoExtractor instances, queried in order.
    _ies = []
    # PostProcessor chain run after each successful download.
    _pps = []
    # Return code for download() (0 = success, 1 = some download failed).
    _download_retcode = None
    # Ordinal of the current download; feeds the %(autonumber)s template.
    _num_downloads = None
    # Stream used for status output (stdout, or stderr with logtostderr).
    _screen_file = None
438 def __init__(self, params):
439 """Create a FileDownloader object with the given options."""
440 self._ies = []
441 self._pps = []
442 self._download_retcode = 0
443 self._num_downloads = 0
444 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
445 self.params = params
446
447 @staticmethod
448 def pmkdir(filename):
449 """Create directory components in filename. Similar to Unix "mkdir -p"."""
450 components = filename.split(os.sep)
451 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
452 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
453 for dir in aggregate:
454 if not os.path.exists(dir):
455 os.mkdir(dir)
456
457 @staticmethod
458 def format_bytes(bytes):
459 if bytes is None:
460 return 'N/A'
461 if type(bytes) is str:
462 bytes = float(bytes)
463 if bytes == 0.0:
464 exponent = 0
465 else:
466 exponent = long(math.log(bytes, 1024.0))
467 suffix = 'bkMGTPEZY'[exponent]
468 converted = float(bytes) / float(1024**exponent)
469 return '%.2f%s' % (converted, suffix)
470
471 @staticmethod
472 def calc_percent(byte_counter, data_len):
473 if data_len is None:
474 return '---.-%'
475 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
476
477 @staticmethod
478 def calc_eta(start, now, total, current):
479 if total is None:
480 return '--:--'
481 dif = now - start
482 if current == 0 or dif < 0.001: # One millisecond
483 return '--:--'
484 rate = float(current) / dif
485 eta = long((float(total) - float(current)) / rate)
486 (eta_mins, eta_secs) = divmod(eta, 60)
487 if eta_mins > 99:
488 return '--:--'
489 return '%02d:%02d' % (eta_mins, eta_secs)
490
491 @staticmethod
492 def calc_speed(start, now, bytes):
493 dif = now - start
494 if bytes == 0 or dif < 0.001: # One millisecond
495 return '%10s' % '---b/s'
496 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
497
498 @staticmethod
499 def best_block_size(elapsed_time, bytes):
500 new_min = max(bytes / 2.0, 1.0)
501 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
502 if elapsed_time < 0.001:
503 return long(new_max)
504 rate = bytes / elapsed_time
505 if rate > new_max:
506 return long(new_max)
507 if rate < new_min:
508 return long(new_min)
509 return long(rate)
510
511 @staticmethod
512 def parse_bytes(bytestr):
513 """Parse a string indicating a byte quantity into a long integer."""
514 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
515 if matchobj is None:
516 return None
517 number = float(matchobj.group(1))
518 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
519 return long(round(number * multiplier))
520
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        # Mutual registration: the IE also gets a reference to us.
        ie.set_downloader(self)
525
    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        # Mutual registration: the PP also gets a reference to us.
        pp.set_downloader(self)
530
    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        try:
            if not self.params.get('quiet', False):
                terminator = [u'\n', u''][skip_eol]
                # The trailing comma suppresses print's own newline; the
                # chosen terminator is appended to the message instead.
                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                self._screen_file.flush()
        except (UnicodeEncodeError), err:
            if not ignore_encoding_errors:
                raise
541
    def to_stderr(self, message):
        """Print message to stderr (always, regardless of quiet mode)."""
        print >>sys.stderr, message.encode(preferredencoding())
545
    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            return
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm-style escape sequence for setting the window title.
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
556
557 def fixed_template(self):
558 """Checks if the output template is fixed."""
559 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
560
561 def trouble(self, message=None):
562 """Determine action to take when a download problem appears.
563
564 Depending on if the downloader has been configured to ignore
565 download errors or not, this method may throw an exception or
566 not when errors are found, after printing the message.
567 """
568 if message is not None:
569 self.to_stderr(message)
570 if not self.params.get('ignoreerrors', False):
571 raise DownloadError(message)
572 self._download_retcode = 1
573
574 def slow_down(self, start_time, byte_counter):
575 """Sleep if the download speed is over the rate limit."""
576 rate_limit = self.params.get('ratelimit', None)
577 if rate_limit is None or byte_counter == 0:
578 return
579 now = time.time()
580 elapsed = now - start_time
581 if elapsed <= 0.0:
582 return
583 speed = float(byte_counter) / elapsed
584 if speed > rate_limit:
585 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
586
587 def temp_name(self, filename):
588 """Returns a temporary filename for the given filename."""
589 if self.params.get('nopart', False) or filename == u'-' or \
590 (os.path.exists(filename) and not os.path.isfile(filename)):
591 return filename
592 return filename + u'.part'
593
594 def undo_temp_name(self, filename):
595 if filename.endswith(u'.part'):
596 return filename[:-len(u'.part')]
597 return filename
598
599 def try_rename(self, old_filename, new_filename):
600 try:
601 if old_filename == new_filename:
602 return
603 os.rename(old_filename, new_filename)
604 except (IOError, OSError), err:
605 self.trouble(u'ERROR: unable to rename file')
606
607 def try_utime(self, filename, last_modified_hdr):
608 """Try to set the last-modified time of the given file."""
609 if last_modified_hdr is None:
610 return
611 if not os.path.isfile(filename):
612 return
613 timestr = last_modified_hdr
614 if timestr is None:
615 return
616 filetime = timeconvert(timestr)
617 if filetime is None:
618 return
619 try:
620 os.utime(filename,(time.time(), filetime))
621 except:
622 pass
623
    def report_writedescription(self, descfn):
        """Report that the description file is being written."""
        self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
627
    def report_writeinfojson(self, infofn):
        """Report that the metadata file is being written."""
        self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
631
    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
635
    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            return
        # The leading '\r' rewrites the current line instead of appending.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
644
    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
648
    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx."""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
652
    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # Fall back to a generic message if the name cannot be encoded.
            self.to_screen(u'[download] The file has already been downloaded')
659
    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')
663
    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        else:
            # The progress line used '\r'; print the newline it withheld.
            self.to_screen(u'')
670
    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1
674
675 def prepare_filename(self, info_dict):
676 """Generate the output filename."""
677 try:
678 template_dict = dict(info_dict)
679 template_dict['epoch'] = unicode(long(time.time()))
680 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
681 filename = self.params['outtmpl'] % template_dict
682 return filename
683 except (ValueError, KeyError), err:
684 self.trouble(u'ERROR: invalid system charset or erroneous output template')
685 return None
686
    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        filename = self.prepare_filename(info_dict)
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Forced printings
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcefilename', False) and filename is not None:
                print filename.encode(preferredencoding(), 'xmlcharrefreplace')

            return

        if filename is None:
            # prepare_filename already reported the template problem.
            return
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')
            return

        try:
            self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directories: %s' % str(err))
            return

        if self.params.get('writedescription', False):
            try:
                descfn = filename + '.description'
                self.report_writedescription(descfn)
                descfile = open(descfn, 'wb')
                try:
                    descfile.write(info_dict['description'].encode('utf-8'))
                finally:
                    descfile.close()
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
                return

        if self.params.get('writeinfojson', False):
            infofn = filename + '.info.json'
            self.report_writeinfojson(infofn)
            try:
                # Probe for an encoder: the bundled trivialjson fallback
                # only implements loads(), so json.dump may be missing.
                json.dump
            except (NameError,AttributeError):
                self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
                return
            try:
                infof = open(infofn, 'wb')
                try:
                    json.dump(info_dict, infof)
                finally:
                    infof.close()
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
                return

        try:
            success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:
            # Local filesystem errors are treated as "video unavailable".
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
            return
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
            return

        if success:
            try:
                self.post_process(filename, info_dict)
            except (PostProcessingError), err:
                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                return
766
767 def download(self, url_list):
768 """Download a given list of URLs."""
769 if len(url_list) > 1 and self.fixed_template():
770 raise SameFileError(self.params['outtmpl'])
771
772 for url in url_list:
773 suitable_found = False
774 for ie in self._ies:
775 # Go to next InfoExtractor if not suitable
776 if not ie.suitable(url):
777 continue
778
779 # Suitable InfoExtractor found
780 suitable_found = True
781
782 # Extract information from URL and process it
783 ie.extract(url)
784
785 # Suitable InfoExtractor had been found; go to next URL
786 break
787
788 if not suitable_found:
789 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
790
791 return self._download_retcode
792
793 def post_process(self, filename, ie_info):
794 """Run the postprocessing chain on the given file."""
795 info = dict(ie_info)
796 info['filepath'] = filename
797 for pp in self._pps:
798 info = pp.run(info)
799 if info is None:
800 break
801
    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an rtmp:// URL by driving the external rtmpdump tool."""
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        # Note: [[], opts][flag] appends opts only when flag is true.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            # Resume (-e); '-k 1' is added after a plain failure (retval 1).
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            if prevsize == cursize and retval == 1:
                # No progress since the last attempt: stop retrying.
                break
        if retval == 0:
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
            self.try_rename(tmpfilename, filename)
            return True
        else:
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            return False
833
834 def _do_download(self, filename, url, player_url):
835 # Check file already present
836 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
837 self.report_file_already_downloaded(filename)
838 return True
839
840 # Attempt to download using rtmpdump
841 if url.startswith('rtmp'):
842 return self._download_with_rtmpdump(filename, url, player_url)
843
844 tmpfilename = self.temp_name(filename)
845 stream = None
846 open_mode = 'wb'
847
848 # Do not include the Accept-Encoding header
849 headers = {'Youtubedl-no-compression': 'True'}
850 basic_request = urllib2.Request(url, None, headers)
851 request = urllib2.Request(url, None, headers)
852
853 # Establish possible resume length
854 if os.path.isfile(tmpfilename):
855 resume_len = os.path.getsize(tmpfilename)
856 else:
857 resume_len = 0
858
859 # Request parameters in case of being able to resume
860 if self.params.get('continuedl', False) and resume_len != 0:
861 self.report_resuming_byte(resume_len)
862 request.add_header('Range','bytes=%d-' % resume_len)
863 open_mode = 'ab'
864
865 count = 0
866 retries = self.params.get('retries', 0)
867 while count <= retries:
868 # Establish connection
869 try:
870 data = urllib2.urlopen(request)
871 break
872 except (urllib2.HTTPError, ), err:
873 if (err.code < 500 or err.code >= 600) and err.code != 416:
874 # Unexpected HTTP error
875 raise
876 elif err.code == 416:
877 # Unable to resume (requested range not satisfiable)
878 try:
879 # Open the connection again without the range header
880 data = urllib2.urlopen(basic_request)
881 content_length = data.info()['Content-Length']
882 except (urllib2.HTTPError, ), err:
883 if err.code < 500 or err.code >= 600:
884 raise
885 else:
886 # Examine the reported length
887 if (content_length is not None and
888 (resume_len - 100 < long(content_length) < resume_len + 100)):
889 # The file had already been fully downloaded.
890 # Explanation to the above condition: in issue #175 it was revealed that
891 # YouTube sometimes adds or removes a few bytes from the end of the file,
892 # changing the file size slightly and causing problems for some users. So
893 # I decided to implement a suggested change and consider the file
894 # completely downloaded if the file size differs less than 100 bytes from
895 # the one in the hard drive.
896 self.report_file_already_downloaded(filename)
897 self.try_rename(tmpfilename, filename)
898 return True
899 else:
900 # The length does not match, we start the download over
901 self.report_unable_to_resume()
902 open_mode = 'wb'
903 break
904 # Retry
905 count += 1
906 if count <= retries:
907 self.report_retry(count, retries)
908
909 if count > retries:
910 self.trouble(u'ERROR: giving up after %s retries' % retries)
911 return False
912
913 data_len = data.info().get('Content-length', None)
914 if data_len is not None:
915 data_len = long(data_len) + resume_len
916 data_len_str = self.format_bytes(data_len)
917 byte_counter = 0 + resume_len
918 block_size = 1024
919 start = time.time()
920 while True:
921 # Download and write
922 before = time.time()
923 data_block = data.read(block_size)
924 after = time.time()
925 if len(data_block) == 0:
926 break
927 byte_counter += len(data_block)
928
929 # Open file just in time
930 if stream is None:
931 try:
932 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
933 assert stream is not None
934 filename = self.undo_temp_name(tmpfilename)
935 self.report_destination(filename)
936 except (OSError, IOError), err:
937 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
938 return False
939 try:
940 stream.write(data_block)
941 except (IOError, OSError), err:
942 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
943 return False
944 block_size = self.best_block_size(after - before, len(data_block))
945
946 # Progress message
947 percent_str = self.calc_percent(byte_counter, data_len)
948 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
949 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
950 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
951
952 # Apply rate limit
953 self.slow_down(start, byte_counter - resume_len)
954
955 if stream is None:
956 self.trouble(u'\nERROR: Did not get any data blocks')
957 return False
958 stream.close()
959 self.report_finish()
960 if data_len is not None and byte_counter != data_len:
961 raise ContentTooShortError(byte_counter, long(data_len))
962 self.try_rename(tmpfilename, filename)
963
964 # Update file modification time
965 if self.params.get('updatetime', True):
966 self.try_utime(filename, data.info().get('last-modified', None))
967
968 return True
969
class InfoExtractor(object):
    """Base class for all information extractors (IEs).

    An information extractor turns a URL into one or more dictionaries of
    video metadata, which it hands to the FileDownloader for processing
    (possibly downloading the video to the file system, among other
    outcomes). Each dictionary must carry the following keys:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    Two optional keys, 'thumbnail' and 'description', exist mainly so
    youtube-dl can back a video search service; they are only used when
    their respective forced-printing functions are called.

    Subclasses should override _real_initialize(), _real_extract() and the
    suitable() static method, and are typically instantiated and added to
    the main downloader.
    """

    # Flipped to True once _real_initialize() has run.
    _ready = False
    # The FileDownloader this IE reports to (may stay None).
    _downloader = None

    def __init__(self, downloader=None):
        """Create the extractor and attach the optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Return True if this IE can handle the given URL."""
        return False

    def set_downloader(self, downloader):
        """Attach the FileDownloader used for output and processing."""
        self._downloader = downloader

    def initialize(self):
        """Run one-time setup (authentication, etc.) exactly once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then extract and return the URL's info."""
        self.initialize()
        return self._real_extract(url)

    def _real_initialize(self):
        """Actual setup work. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Actual extraction work. Redefine in subclasses."""
        pass
1040
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Handles language selection, optional account login (from explicit
    credentials or .netrc), age-gate confirmation, and extraction of the
    real video URL plus metadata for one or more requested formats.
    """

    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
    # Maps itag -> filename extension; anything unlisted falls back to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '45': 'webm',
    }

    @staticmethod
    def suitable(url):
        """Return True if the URL matches a known YouTube URL shape."""
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age.

        All failures before the age check are warnings only: extraction
        can still proceed anonymously.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # Bad/missing .netrc aborts initialization with a warning.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language (forces English pages so later regexes match)
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract video URL(s) and metadata and pass them to the downloader."""
        # Extract video id from URL (group 2 of _VALID_URL is the id)
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        # NOTE(review): the '&amp;' before has_verified looks HTML-escaped;
        # presumably a plain '&' was intended — confirm before changing.
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL; the page embeds it with
        # backslash-escaped slashes, which re.sub below unescapes.
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' page variants until one returns
        # a 'token' parameter (different variants work for different videos).
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title: collapse runs of non-alphanumerics to '_'
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then try known date formats.
        # NOTE: the bare except silently skips formats that fail to parse;
        # on success upload_date becomes 'YYYYMMDD'.
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description: use lxml if the optional import at the top of the
        # file succeeded, otherwise fall back to a meta-tag regex (and only
        # bother when the description is actually requested).
        try:
            lxml.etree
        except NameError:
            video_description = u'No description available.'
            if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
                mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
                if mobj is not None:
                    video_description = mobj.group(1).decode('utf-8')
        else:
            html_parser = lxml.etree.HTMLParser(encoding='utf-8')
            vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
            video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
            # TODO use another parser

        # token (extracted but not used further in this method)
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build itag -> URL map from the comma-separated stream map.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            # Honor -f/--max-quality by truncating the quality-ordered list.
            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description,
                    'player_url': player_url,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
1329
1330
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    Initialization retrieves the family-filter disclaimer and confirms
    age; extraction delegates 'yt-' prefixed ids to the YouTube IE.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    # YoutubeIE instance used for embedded YouTube videos.
    _youtube_ie = None

    def __init__(self, youtube_ie, downloader=None):
        """Store the YouTube IE used to handle 'yt-' delegated videos."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        """Return True for metacafe.com /watch/ URLs."""
        return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the age-confirmation form."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL and metadata from a Metacafe watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate and stop.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
            return

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage. Two page layouts
        # exist: a direct &mediaURL= parameter, or JSON-ish 'mediaData'
        # inside the flashvars input.
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = urllib.unquote(mobj.group(1))
            # Last three characters of the URL are taken as the extension.
            video_extension = mediaURL[-3:]

            # Extract gdaKey (access token appended as __gda__) if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            # Unescape the JSON-style '\/' sequences in the URL.
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1474
1475
1476 class DailymotionIE(InfoExtractor):
1477 """Information Extractor for Dailymotion"""
1478
1479 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1480
1481 def __init__(self, downloader=None):
1482 InfoExtractor.__init__(self, downloader)
1483
1484 @staticmethod
1485 def suitable(url):
1486 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1487
1488 def report_download_webpage(self, video_id):
1489 """Report webpage download."""
1490 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1491
1492 def report_extraction(self, video_id):
1493 """Report information extraction."""
1494 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1495
1496 def _real_initialize(self):
1497 return
1498
1499 def _real_extract(self, url):
1500 # Extract id and simplified title from URL
1501 mobj = re.match(self._VALID_URL, url)
1502 if mobj is None:
1503 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1504 return
1505
1506 # At this point we have a new video
1507 self._downloader.increment_downloads()
1508 video_id = mobj.group(1)
1509
1510 simple_title = mobj.group(2).decode('utf-8')
1511 video_extension = 'flv'
1512
1513 # Retrieve video webpage to extract further information
1514 request = urllib2.Request(url)
1515 try:
1516 self.report_download_webpage(video_id)
1517 webpage = urllib2.urlopen(request).read()
1518 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1519 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1520 return
1521
1522 # Extract URL, uploader and title from webpage
1523 self.report_extraction(video_id)
1524 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1525 if mobj is None:
1526 self._downloader.trouble(u'ERROR: unable to extract media URL')
1527 return
1528 mediaURL = urllib.unquote(mobj.group(1))
1529
1530 # if needed add http://www.dailymotion.com/ if relative URL
1531
1532 video_url = mediaURL
1533
1534 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1535 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1536 if mobj is None:
1537 self._downloader.trouble(u'ERROR: unable to extract title')
1538 return
1539 video_title = mobj.group(1).decode('utf-8')
1540 video_title = sanitize_title(video_title)
1541
1542 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1543 if mobj is None:
1544 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1545 return
1546 video_uploader = mobj.group(1)
1547
1548 try:
1549 # Process video information
1550 self._downloader.process_info({
1551 'id': video_id.decode('utf-8'),
1552 'url': video_url.decode('utf-8'),
1553 'uploader': video_uploader.decode('utf-8'),
1554 'upload_date': u'NA',
1555 'title': video_title,
1556 'stitle': simple_title,
1557 'ext': video_extension.decode('utf-8'),
1558 'format': u'NA',
1559 'player_url': None,
1560 })
1561 except UnavailableVideoError:
1562 self._downloader.trouble(u'\nERROR: unable to download video')
1563
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        """Return True for Google Video /videoplay?docid= URLs."""
        return (re.match(GoogleIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No setup required for Google Video.
        return

    def _real_extract(self, url):
        """Scrape the play page for the media URL, title and description."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage.
        # Prefer the direct MP4 download link; if absent, fall back to the
        # FLV stream URL, which is embedded with \xNN escape sequences.
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))
        # Undo the literal '\x3d' / '\x26' escapes ('=' and '&').
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail; requires a second request, so only done
        # when explicitly requested via 'forcethumbnail'.
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info
            video_thumbnail = ''


        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': u'NA',
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1673
1674
1675 class PhotobucketIE(InfoExtractor):
1676 """Information extractor for photobucket.com."""
1677
1678 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1679
1680 def __init__(self, downloader=None):
1681 InfoExtractor.__init__(self, downloader)
1682
1683 @staticmethod
1684 def suitable(url):
1685 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1686
1687 def report_download_webpage(self, video_id):
1688 """Report webpage download."""
1689 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1690
1691 def report_extraction(self, video_id):
1692 """Report information extraction."""
1693 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1694
1695 def _real_initialize(self):
1696 return
1697
1698 def _real_extract(self, url):
1699 # Extract id from URL
1700 mobj = re.match(self._VALID_URL, url)
1701 if mobj is None:
1702 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1703 return
1704
1705 # At this point we have a new video
1706 self._downloader.increment_downloads()
1707 video_id = mobj.group(1)
1708
1709 video_extension = 'flv'
1710
1711 # Retrieve video webpage to extract further information
1712 request = urllib2.Request(url)
1713 try:
1714 self.report_download_webpage(video_id)
1715 webpage = urllib2.urlopen(request).read()
1716 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1717 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1718 return
1719
1720 # Extract URL, uploader, and title from webpage
1721 self.report_extraction(video_id)
1722 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1723 if mobj is None:
1724 self._downloader.trouble(u'ERROR: unable to extract media URL')
1725 return
1726 mediaURL = urllib.unquote(mobj.group(1))
1727
1728 video_url = mediaURL
1729
1730 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1731 if mobj is None:
1732 self._downloader.trouble(u'ERROR: unable to extract title')
1733 return
1734 video_title = mobj.group(1).decode('utf-8')
1735 video_title = sanitize_title(video_title)
1736 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1737
1738 video_uploader = mobj.group(2).decode('utf-8')
1739
1740 try:
1741 # Process video information
1742 self._downloader.process_info({
1743 'id': video_id.decode('utf-8'),
1744 'url': video_url.decode('utf-8'),
1745 'uploader': video_uploader,
1746 'upload_date': u'NA',
1747 'title': video_title,
1748 'stitle': simple_title,
1749 'ext': video_extension.decode('utf-8'),
1750 'format': u'NA',
1751 'player_url': None,
1752 })
1753 except UnavailableVideoError:
1754 self._downloader.trouble(u'\nERROR: unable to download video')
1755
1756
1757 class YahooIE(InfoExtractor):
1758 """Information extractor for video.yahoo.com."""
1759
1760 # _VALID_URL matches all Yahoo! Video URLs
1761 # _VPAGE_URL matches only the extractable '/watch/' URLs
1762 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1763 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1764
1765 def __init__(self, downloader=None):
1766 InfoExtractor.__init__(self, downloader)
1767
1768 @staticmethod
1769 def suitable(url):
1770 return (re.match(YahooIE._VALID_URL, url) is not None)
1771
1772 def report_download_webpage(self, video_id):
1773 """Report webpage download."""
1774 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1775
1776 def report_extraction(self, video_id):
1777 """Report information extraction."""
1778 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1779
1780 def _real_initialize(self):
1781 return
1782
1783 def _real_extract(self, url, new_video=True):
1784 # Extract ID from URL
1785 mobj = re.match(self._VALID_URL, url)
1786 if mobj is None:
1787 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1788 return
1789
1790 # At this point we have a new video
1791 self._downloader.increment_downloads()
1792 video_id = mobj.group(2)
1793 video_extension = 'flv'
1794
1795 # Rewrite valid but non-extractable URLs as
1796 # extractable English language /watch/ URLs
1797 if re.match(self._VPAGE_URL, url) is None:
1798 request = urllib2.Request(url)
1799 try:
1800 webpage = urllib2.urlopen(request).read()
1801 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1802 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1803 return
1804
1805 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1806 if mobj is None:
1807 self._downloader.trouble(u'ERROR: Unable to extract id field')
1808 return
1809 yahoo_id = mobj.group(1)
1810
1811 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1812 if mobj is None:
1813 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1814 return
1815 yahoo_vid = mobj.group(1)
1816
1817 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1818 return self._real_extract(url, new_video=False)
1819
1820 # Retrieve video webpage to extract further information
1821 request = urllib2.Request(url)
1822 try:
1823 self.report_download_webpage(video_id)
1824 webpage = urllib2.urlopen(request).read()
1825 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1826 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1827 return
1828
1829 # Extract uploader and title from webpage
1830 self.report_extraction(video_id)
1831 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1832 if mobj is None:
1833 self._downloader.trouble(u'ERROR: unable to extract video title')
1834 return
1835 video_title = mobj.group(1).decode('utf-8')
1836 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1837
1838 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1839 if mobj is None:
1840 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1841 return
1842 video_uploader = mobj.group(1).decode('utf-8')
1843
1844 # Extract video thumbnail
1845 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1846 if mobj is None:
1847 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1848 return
1849 video_thumbnail = mobj.group(1).decode('utf-8')
1850
1851 # Extract video description
1852 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1853 if mobj is None:
1854 self._downloader.trouble(u'ERROR: unable to extract video description')
1855 return
1856 video_description = mobj.group(1).decode('utf-8')
1857 if not video_description: video_description = 'No description available.'
1858
1859 # Extract video height and width
1860 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1861 if mobj is None:
1862 self._downloader.trouble(u'ERROR: unable to extract video height')
1863 return
1864 yv_video_height = mobj.group(1)
1865
1866 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1867 if mobj is None:
1868 self._downloader.trouble(u'ERROR: unable to extract video width')
1869 return
1870 yv_video_width = mobj.group(1)
1871
1872 # Retrieve video playlist to extract media URL
1873 # I'm not completely sure what all these options are, but we
1874 # seem to need most of them, otherwise the server sends a 401.
1875 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1876 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1877 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1878 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1879 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1880 try:
1881 self.report_download_webpage(video_id)
1882 webpage = urllib2.urlopen(request).read()
1883 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1884 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1885 return
1886
1887 # Extract media URL from playlist XML
1888 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1889 if mobj is None:
1890 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1891 return
1892 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1893 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1894
1895 try:
1896 # Process video information
1897 self._downloader.process_info({
1898 'id': video_id.decode('utf-8'),
1899 'url': video_url,
1900 'uploader': video_uploader,
1901 'upload_date': u'NA',
1902 'title': video_title,
1903 'stitle': simple_title,
1904 'ext': video_extension.decode('utf-8'),
1905 'thumbnail': video_thumbnail.decode('utf-8'),
1906 'description': video_description,
1907 'thumbnail': video_thumbnail,
1908 'description': video_description,
1909 'player_url': None,
1910 })
1911 except UnavailableVideoError:
1912 self._downloader.trouble(u'\nERROR: unable to download video')
1913
1914
1915 class VimeoIE(InfoExtractor):
1916 """Information extractor for vimeo.com."""
1917
1918 # _VALID_URL matches Vimeo URLs
1919 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1920
1921 def __init__(self, downloader=None):
1922 InfoExtractor.__init__(self, downloader)
1923
1924 @staticmethod
1925 def suitable(url):
1926 return (re.match(VimeoIE._VALID_URL, url) is not None)
1927
1928 def report_download_webpage(self, video_id):
1929 """Report webpage download."""
1930 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1931
1932 def report_extraction(self, video_id):
1933 """Report information extraction."""
1934 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1935
1936 def _real_initialize(self):
1937 return
1938
1939 def _real_extract(self, url, new_video=True):
1940 # Extract ID from URL
1941 mobj = re.match(self._VALID_URL, url)
1942 if mobj is None:
1943 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1944 return
1945
1946 # At this point we have a new video
1947 self._downloader.increment_downloads()
1948 video_id = mobj.group(1)
1949
1950 # Retrieve video webpage to extract further information
1951 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1952 try:
1953 self.report_download_webpage(video_id)
1954 webpage = urllib2.urlopen(request).read()
1955 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1956 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1957 return
1958
1959 # Now we begin extracting as much information as we can from what we
1960 # retrieved. First we extract the information common to all extractors,
1961 # and latter we extract those that are Vimeo specific.
1962 self.report_extraction(video_id)
1963
1964 # Extract title
1965 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1966 if mobj is None:
1967 self._downloader.trouble(u'ERROR: unable to extract video title')
1968 return
1969 video_title = mobj.group(1).decode('utf-8')
1970 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1971
1972 # Extract uploader
1973 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1974 if mobj is None:
1975 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1976 return
1977 video_uploader = mobj.group(1).decode('utf-8')
1978
1979 # Extract video thumbnail
1980 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1981 if mobj is None:
1982 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1983 return
1984 video_thumbnail = mobj.group(1).decode('utf-8')
1985
1986 # # Extract video description
1987 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
1988 # if mobj is None:
1989 # self._downloader.trouble(u'ERROR: unable to extract video description')
1990 # return
1991 # video_description = mobj.group(1).decode('utf-8')
1992 # if not video_description: video_description = 'No description available.'
1993 video_description = 'Foo.'
1994
1995 # Vimeo specific: extract request signature
1996 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
1997 if mobj is None:
1998 self._downloader.trouble(u'ERROR: unable to extract request signature')
1999 return
2000 sig = mobj.group(1).decode('utf-8')
2001
2002 # Vimeo specific: Extract request signature expiration
2003 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2004 if mobj is None:
2005 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2006 return
2007 sig_exp = mobj.group(1).decode('utf-8')
2008
2009 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2010
2011 try:
2012 # Process video information
2013 self._downloader.process_info({
2014 'id': video_id.decode('utf-8'),
2015 'url': video_url,
2016 'uploader': video_uploader,
2017 'upload_date': u'NA',
2018 'title': video_title,
2019 'stitle': simple_title,
2020 'ext': u'mp4',
2021 'thumbnail': video_thumbnail.decode('utf-8'),
2022 'description': video_description,
2023 'thumbnail': video_thumbnail,
2024 'description': video_description,
2025 'player_url': None,
2026 })
2027 except UnavailableVideoError:
2028 self._downloader.trouble(u'ERROR: unable to download video')
2029
2030
2031 class GenericIE(InfoExtractor):
2032 """Generic last-resort information extractor."""
2033
2034 def __init__(self, downloader=None):
2035 InfoExtractor.__init__(self, downloader)
2036
2037 @staticmethod
2038 def suitable(url):
2039 return True
2040
2041 def report_download_webpage(self, video_id):
2042 """Report webpage download."""
2043 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2044 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2045
2046 def report_extraction(self, video_id):
2047 """Report information extraction."""
2048 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2049
2050 def _real_initialize(self):
2051 return
2052
2053 def _real_extract(self, url):
2054 # At this point we have a new video
2055 self._downloader.increment_downloads()
2056
2057 video_id = url.split('/')[-1]
2058 request = urllib2.Request(url)
2059 try:
2060 self.report_download_webpage(video_id)
2061 webpage = urllib2.urlopen(request).read()
2062 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2063 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2064 return
2065 except ValueError, err:
2066 # since this is the last-resort InfoExtractor, if
2067 # this error is thrown, it'll be thrown here
2068 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2069 return
2070
2071 self.report_extraction(video_id)
2072 # Start with something easy: JW Player in SWFObject
2073 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2074 if mobj is None:
2075 # Broaden the search a little bit
2076 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2077 if mobj is None:
2078 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2079 return
2080
2081 # It's possible that one of the regexes
2082 # matched, but returned an empty group:
2083 if mobj.group(1) is None:
2084 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2085 return
2086
2087 video_url = urllib.unquote(mobj.group(1))
2088 video_id = os.path.basename(video_url)
2089
2090 # here's a fun little line of code for you:
2091 video_extension = os.path.splitext(video_id)[1][1:]
2092 video_id = os.path.splitext(video_id)[0]
2093
2094 # it's tempting to parse this further, but you would
2095 # have to take into account all the variations like
2096 # Video Title - Site Name
2097 # Site Name | Video Title
2098 # Video Title - Tagline | Site Name
2099 # and so on and so forth; it's just not practical
2100 mobj = re.search(r'<title>(.*)</title>', webpage)
2101 if mobj is None:
2102 self._downloader.trouble(u'ERROR: unable to extract title')
2103 return
2104 video_title = mobj.group(1).decode('utf-8')
2105 video_title = sanitize_title(video_title)
2106 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2107
2108 # video uploader is domain name
2109 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2110 if mobj is None:
2111 self._downloader.trouble(u'ERROR: unable to extract title')
2112 return
2113 video_uploader = mobj.group(1).decode('utf-8')
2114
2115 try:
2116 # Process video information
2117 self._downloader.process_info({
2118 'id': video_id.decode('utf-8'),
2119 'url': video_url.decode('utf-8'),
2120 'uploader': video_uploader,
2121 'upload_date': u'NA',
2122 'title': video_title,
2123 'stitle': simple_title,
2124 'ext': video_extension.decode('utf-8'),
2125 'format': u'NA',
2126 'player_url': None,
2127 })
2128 except UnavailableVideoError, err:
2129 self._downloader.trouble(u'\nERROR: unable to download video')
2130
2131
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Accepts 'ytsearch:<query>' (first result only), 'ytsearchN:<query>'
    (first N results) and 'ytsearchall:<query>' (up to
    _max_youtube_results). Actual video extraction is delegated to the
    wrapped YoutubeIE instance.
    """
    _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'  # recognized query syntax
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'  # marks a video link in a result page
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'  # present while more result pages exist
    _youtube_ie = None  # YoutubeIE used for the per-video extraction
    _max_youtube_results = 1000  # hard cap on results fetched per query

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        """Return True if the string looks like a ytsearch query."""
        return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        """Parse the 'ytsearch...' prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_QUERY, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'ytsearch' (8 characters)
        query = query.encode('utf-8')
        if prefix == '':
            # bare 'ytsearch:' downloads only the first result
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = long(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    # clamp oversized requests rather than failing
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()  # guards against duplicate ids across result pages
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                # the match looks like href="/watch?v=ID"; take the text after
                # the second '=' and drop the trailing quote
                video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # no 'Next' link: fewer than n results exist; extract what we have
                for id in video_ids:
                    self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                return

            pagenum = pagenum + 1
2222
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Accepts 'gvsearch:<query>' (first result only), 'gvsearchN:<query>'
    (first N results) and 'gvsearchall:<query>' (up to
    _max_google_results). Per-video extraction is delegated to the
    wrapped GoogleIE instance.
    """
    _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'  # recognized query syntax
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'  # captures the docid of a result
    _MORE_PAGES_INDICATOR = r'<span>Next</span>'  # present while more result pages exist
    _google_ie = None  # GoogleIE used for the per-video extraction
    _max_google_results = 1000  # hard cap on results fetched per query

    def __init__(self, google_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._google_ie = google_ie

    @staticmethod
    def suitable(url):
        """Return True if the string looks like a gvsearch query."""
        return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._google_ie.initialize()

    def _real_extract(self, query):
        """Parse the 'gvsearch...' prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_QUERY, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'gvsearch' (8 characters)
        query = query.encode('utf-8')
        if prefix == '':
            # bare 'gvsearch:' downloads only the first result
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = long(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    # clamp oversized requests rather than failing
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()  # guards against duplicate ids across result pages
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            # NOTE(review): the template's 'start' parameter is filled with the
            # page number, not a result offset -- confirm against the site.
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # no 'Next' link: fewer than n results exist; extract what we have
                for id in video_ids:
                    self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
                return

            pagenum = pagenum + 1
2313
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Accepts 'yvsearch:<query>' (first result only), 'yvsearchN:<query>'
    (first N results) and 'yvsearchall:<query>' (up to
    _max_yahoo_results). Per-video extraction is delegated to the
    wrapped YahooIE instance.
    """
    _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'  # recognized query syntax
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'  # captures 'vid/id'
    _MORE_PAGES_INDICATOR = r'\s*Next'  # present while more result pages exist
    _yahoo_ie = None  # YahooIE used for the per-video extraction
    _max_yahoo_results = 1000  # hard cap on results fetched per query

    def __init__(self, yahoo_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._yahoo_ie = yahoo_ie

    @staticmethod
    def suitable(url):
        """Return True if the string looks like a yvsearch query."""
        return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._yahoo_ie.initialize()

    def _real_extract(self, query):
        """Parse the 'yvsearch...' prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_QUERY, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'yvsearch' (8 characters)
        query = query.encode('utf-8')
        if prefix == '':
            # bare 'yvsearch:' downloads only the first result
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = long(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    # clamp oversized requests rather than failing
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()  # guards against duplicate ids across result pages
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            # NOTE(review): the template's 'o' parameter is filled with the
            # page number -- confirm this is what the site expects.
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # no 'Next' link: fewer than n results exist; extract what we have
                for id in video_ids:
                    self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
                return

            pagenum = pagenum + 1
2404
2405 class YoutubePlaylistIE(InfoExtractor):
2406 """Information Extractor for YouTube playlists."""
2407
2408 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2409 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2410 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2411 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2412 _youtube_ie = None
2413
2414 def __init__(self, youtube_ie, downloader=None):
2415 InfoExtractor.__init__(self, downloader)
2416 self._youtube_ie = youtube_ie
2417
2418 @staticmethod
2419 def suitable(url):
2420 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2421
2422 def report_download_page(self, playlist_id, pagenum):
2423 """Report attempt to download playlist page with given number."""
2424 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2425
2426 def _real_initialize(self):
2427 self._youtube_ie.initialize()
2428
2429 def _real_extract(self, url):
2430 # Extract playlist id
2431 mobj = re.match(self._VALID_URL, url)
2432 if mobj is None:
2433 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2434 return
2435
2436 # Single video case
2437 if mobj.group(3) is not None:
2438 self._youtube_ie.extract(mobj.group(3))
2439 return
2440
2441 # Download playlist pages
2442 # prefix is 'p' as default for playlists but there are other types that need extra care
2443 playlist_prefix = mobj.group(1)
2444 if playlist_prefix == 'a':
2445 playlist_access = 'artist'
2446 else:
2447 playlist_prefix = 'p'
2448 playlist_access = 'view_play_list'
2449 playlist_id = mobj.group(2)
2450 video_ids = []
2451 pagenum = 1
2452
2453 while True:
2454 self.report_download_page(playlist_id, pagenum)
2455 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2456 try:
2457 page = urllib2.urlopen(request).read()
2458 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2459 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2460 return
2461
2462 # Extract video identifiers
2463 ids_in_page = []
2464 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2465 if mobj.group(1) not in ids_in_page:
2466 ids_in_page.append(mobj.group(1))
2467 video_ids.extend(ids_in_page)
2468
2469 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2470 break
2471 pagenum = pagenum + 1
2472
2473 playliststart = self._downloader.params.get('playliststart', 1) - 1
2474 playlistend = self._downloader.params.get('playlistend', -1)
2475 video_ids = video_ids[playliststart:playlistend]
2476
2477 for id in video_ids:
2478 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2479 return
2480
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Collects the video ids of a user's uploads via the GDATA feed
    (paged, _GDATA_PAGE_SIZE ids per request) and delegates each video
    to the wrapped YoutubeIE instance.
    """

    _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50  # GDATA caps max-results per request at 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'  # captures a video id in the feed
    _youtube_ie = None  # YoutubeIE used for the per-video extraction

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        """Return True if the URL (or 'ytuser:' shorthand) names a user."""
        return (re.match(YoutubeUserIE._VALID_URL, url) is not None)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            # GDATA start-index is 1-based
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # playliststart is 1-based in params; convert to 0-based index.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        # playlistend == -1 means "no upper bound"
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2570
2571
2572 class DepositFilesIE(InfoExtractor):
2573 """Information extractor for depositfiles.com"""
2574
2575 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2576
2577 def __init__(self, downloader=None):
2578 InfoExtractor.__init__(self, downloader)
2579
2580 @staticmethod
2581 def suitable(url):
2582 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2583
2584 def report_download_webpage(self, file_id):
2585 """Report webpage download."""
2586 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2587
2588 def report_extraction(self, file_id):
2589 """Report information extraction."""
2590 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2591
2592 def _real_initialize(self):
2593 return
2594
2595 def _real_extract(self, url):
2596 # At this point we have a new file
2597 self._downloader.increment_downloads()
2598
2599 file_id = url.split('/')[-1]
2600 # Rebuild url in english locale
2601 url = 'http://depositfiles.com/en/files/' + file_id
2602
2603 # Retrieve file webpage with 'Free download' button pressed
2604 free_download_indication = { 'gateway_result' : '1' }
2605 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2606 try:
2607 self.report_download_webpage(file_id)
2608 webpage = urllib2.urlopen(request).read()
2609 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2610 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2611 return
2612
2613 # Search for the real file URL
2614 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2615 if (mobj is None) or (mobj.group(1) is None):
2616 # Try to figure out reason of the error.
2617 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2618 if (mobj is not None) and (mobj.group(1) is not None):
2619 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2620 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2621 else:
2622 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2623 return
2624
2625 file_url = mobj.group(1)
2626 file_extension = os.path.splitext(file_url)[1][1:]
2627
2628 # Search for file title
2629 mobj = re.search(r'<b title="(.*?)">', webpage)
2630 if mobj is None:
2631 self._downloader.trouble(u'ERROR: unable to extract title')
2632 return
2633 file_title = mobj.group(1).decode('utf-8')
2634
2635 try:
2636 # Process file information
2637 self._downloader.process_info({
2638 'id': file_id.decode('utf-8'),
2639 'url': file_url.decode('utf-8'),
2640 'uploader': u'NA',
2641 'upload_date': u'NA',
2642 'title': file_title,
2643 'stitle': file_title,
2644 'ext': file_extension.decode('utf-8'),
2645 'format': u'NA',
2646 'player_url': None,
2647 })
2648 except UnavailableVideoError, err:
2649 self._downloader.trouble(u'ERROR: unable to download file')
2650
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook video pages.

    Optionally logs in first (via --username/--password or ~/.netrc) so
    that videos restricted to logged-in users can be fetched.
    """

    # The numeric video id is captured in the named group "ID".
    # NOTE(review): the dots in "facebook.com" / "video.php" are unescaped,
    # so they match any character — harmless in practice but imprecise.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    # Mobile login endpoint; credentials are POSTed here by _real_initialize().
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name looked up in ~/.netrc when --netrc is given.
    _NETRC_MACHINE = 'facebook'
    # Known format codes, best quality first.
    _available_formats = ['highqual', 'lowqual']
    # Filename extension used for each format code.
    _video_extensions = {
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        """Return True if this extractor can handle the given URL."""
        return (re.match(FacebookIE._VALID_URL, url) is not None)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page.

        Returns a dict with any of the keys 'title', 'description', 'owner',
        'upload_date', 'thumbnail' that could be scraped, plus 'video_urls'
        (a format-code -> URL dict, possibly empty).
        """
        # General data: regexes that locate each metadata field in the page.
        data = {'title': r'class="video_title datawrap">(.*?)</',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'upload_date': r'data-date="(.*?)"',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Fields are JS-escaped Unicode inside the (mostly utf-8) page.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls, one per known format code when present.
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook if credentials are available; failures are
        reported as warnings and extraction continues anonymously."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        if useremail is None:
            # No credentials: proceed without logging in.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # A login form in the response means the credentials were rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title: collapse filename-unsafe characters to underscores
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail image (missing thumbnail is only a warning, not fatal)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date; best effort, falls back to u'NA'
        upload_date = u'NA'
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # NOTE(review): if url_map is empty, video_url_list is never assigned
        # and the loop below raises NameError — looks like a latent bug;
        # confirm intended behavior before relying on this path.
        for format_param, video_real_url in video_url_list:

            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description.decode('utf-8'),
                    'player_url': None,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
2870 class BlipTVIE(InfoExtractor):
2871 """Information extractor for blip.tv"""
2872
2873 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2874 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2875
2876 @staticmethod
2877 def suitable(url):
2878 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2879
2880 def report_extraction(self, file_id):
2881 """Report information extraction."""
2882 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2883
2884 def _simplify_title(self, title):
2885 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2886 res = res.strip(ur'_')
2887 return res
2888
2889 def _real_extract(self, url):
2890 mobj = re.match(self._VALID_URL, url)
2891 if mobj is None:
2892 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2893 return
2894
2895 if '?' in url:
2896 cchar = '&'
2897 else:
2898 cchar = '?'
2899 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2900 request = urllib2.Request(json_url)
2901 self.report_extraction(mobj.group(1))
2902 try:
2903 json_code = urllib2.urlopen(request).read()
2904 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2905 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2906 return
2907 try:
2908 json_data = json.loads(json_code)
2909 if 'Post' in json_data:
2910 data = json_data['Post']
2911 else:
2912 data = json_data
2913
2914 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2915 video_url = data['media']['url']
2916 umobj = re.match(self._URL_EXT, video_url)
2917 if umobj is None:
2918 raise ValueError('Can not determine filename extension')
2919 ext = umobj.group(1)
2920
2921 self._downloader.increment_downloads()
2922
2923 info = {
2924 'id': data['item_id'],
2925 'url': video_url,
2926 'uploader': data['display_name'],
2927 'upload_date': upload_date,
2928 'title': data['title'],
2929 'stitle': self._simplify_title(data['title']),
2930 'ext': ext,
2931 'format': data['media']['mimeType'],
2932 'thumbnail': data['thumbnailUrl'],
2933 'description': data['description'],
2934 'player_url': data['embedUrl']
2935 }
2936 except (ValueError,KeyError), err:
2937 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2938 return
2939
2940 try:
2941 self._downloader.process_info(info)
2942 except UnavailableVideoError, err:
2943 self._downloader.trouble(u'\nERROR: unable to download video')
2944
class PostProcessor(object):
    """Base class for post-processing steps.

    Instances are attached to a downloader via its add_post_processor()
    method. After each successful download the downloader walks its chain
    of PostProcessors, feeding the return value of one run() call into the
    next. A run() returning None stops the chain; otherwise the (possibly
    modified) information dictionary is forwarded.

    Like InfoExtractor, this class uses a "mutual registration" scheme
    with its downloader.
    """

    # Downloader this PP is registered with (None until set).
    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run this post-processing step.

        `information` is an InfoExtractor-style dictionary extended with a
        "filepath" key pointing at the downloaded file. Returning None halts
        the chain; returning a dictionary passes it to the next processor.
        May raise PostProcessingError, which the calling downloader handles.

        The base implementation is a no-op pass-through.
        """
        return information
class FFmpegExtractAudioPP(PostProcessor):
    """Post processor that converts a downloaded video file into an
    audio-only file using the external ffmpeg/ffprobe tools.

    preferredcodec is 'best' (default), 'aac' or 'mp3'.
    """

    def __init__(self, downloader=None, preferredcodec=None):
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec

    @staticmethod
    def get_audio_codec(path):
        """Return the audio codec name reported by ffprobe for the file at
        path, or None if ffprobe fails or reports no audio stream."""
        devnull = None
        try:
            # Nested try blocks (not try/except/finally) for Python 2.4 compat.
            try:
                devnull = open(os.path.devnull, 'w')
                cmd = ['ffprobe', '-show_streams', '--', path]
                handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
                output = handle.communicate()[0]
                if handle.wait() != 0:
                    return None
            except (IOError, OSError):
                return None
        finally:
            # Fix: the devnull handle used to be leaked on every call.
            if devnull is not None:
                devnull.close()
        audio_codec = None
        for line in output.split('\n'):
            # ffprobe prints codec_name before codec_type within a stream
            # section; remember the last codec_name seen and return it once
            # an audio stream is confirmed.
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                return audio_codec
        return None

    @staticmethod
    def run_ffmpeg(path, out_path, codec, more_opts):
        """Run ffmpeg to extract/transcode the audio track of path into
        out_path; return True on success, False otherwise."""
        devnull = None
        try:
            try:
                devnull = open(os.path.devnull, 'w')
                cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
                ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
                return (ret == 0)
            except (IOError, OSError):
                return False
        finally:
            # Fix: the devnull handle used to be leaked on every call.
            if devnull is not None:
                devnull.close()

    def run(self, information):
        """Convert the downloaded file to audio and delete the original.

        Returns the updated information dict on success, or None (stopping
        the postprocessing chain) on any failure.
        """
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            return None

        more_opts = []
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
            if filecodec == 'aac' or filecodec == 'mp3':
                # Lossless if possible: copy the existing stream.
                acodec = 'copy'
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
            else:
                # MP3 otherwise.
                acodec = 'libmp3lame'
                extension = 'mp3'
                more_opts = ['-ab', '128k']
        else:
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
            extension = self._preferredcodec
            more_opts = ['-ab', '128k']
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']

        (prefix, ext) = os.path.splitext(path)
        new_path = prefix + '.' + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
        status = self.run_ffmpeg(path, new_path, acodec, more_opts)

        if not status:
            self._downloader.to_stderr(u'WARNING: error running ffmpeg')
            return None

        # Remove the original video file only after a successful conversion.
        try:
            os.remove(path)
        except (IOError, OSError):
            self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
            return None

        information['filepath'] = new_path
        return information
3073
3074
def updateSelf(downloader, filename):
    ''' Update the program file with the latest version from the repository '''
    # Note: downloader only used for options
    if not os.access(filename, os.W_OK):
        sys.exit('ERROR: no write permissions on %s' % filename)

    downloader.to_screen('Updating to latest version...')

    # Fix: urlopen() is called before entering the inner try block. The old
    # code opened the URL inside "try/finally: urlh.close()", so a failed
    # urlopen() left urlh unbound and the finally clause raised a NameError
    # that masked the real error and escaped the IOError/OSError handler.
    try:
        urlh = urllib.urlopen(UPDATE_URL)
        try:
            newcontent = urlh.read()
        finally:
            urlh.close()
    except (IOError, OSError):
        sys.exit('ERROR: unable to download latest version')

    try:
        outf = open(filename, 'wb')
        try:
            outf.write(newcontent)
        finally:
            outf.close()
    except (IOError, OSError):
        sys.exit('ERROR: unable to overwrite current version')

    downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3102
def parseOpts():
    """Build the optparse parser and parse sys.argv.

    Returns the tuple (parser, opts, args) so callers can both read the
    parsed options and report errors through parser.error().
    """
    # Deferred imports
    import getpass
    import optparse

    def _format_option_string(option):
        ''' ('-o', '--option') -> -o, --format METAVAR'''

        opts = []

        if option._short_opts: opts.append(option._short_opts[0])
        if option._long_opts: opts.append(option._long_opts[0])
        if len(opts) > 1: opts.insert(1, ', ')

        if option.takes_value(): opts.append(' %s' % option.metavar)

        return "".join(opts)

    def _find_term_columns():
        # Determine the terminal width: honour $COLUMNS first, then fall
        # back to `stty size`; return None if neither works.
        columns = os.environ.get('COLUMNS', None)
        if columns:
            return int(columns)

        try:
            sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            out,err = sp.communicate()
            return int(out.split()[1])
        except:
            pass
        return None

    max_width = 80
    max_help_position = 80

    # No need to wrap help messages if we're on a wide console
    columns = _find_term_columns()
    if columns: max_width = columns

    fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
    fmt.format_option_strings = _format_option_string

    kw = {
        'version'   : __version__,
        'formatter' : fmt,
        'usage' : '%prog [options] url...',
        'conflict_handler' : 'resolve',
    }

    parser = optparse.OptionParser(**kw)

    # option groups
    general        = optparse.OptionGroup(parser, 'General Options')
    authentication = optparse.OptionGroup(parser, 'Authentication Options')
    video_format   = optparse.OptionGroup(parser, 'Video Format Options')
    postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
    filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
    verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

    general.add_option('-h', '--help',
            action='help', help='print this help text and exit')
    general.add_option('-v', '--version',
            action='version', help='print program version and exit')
    general.add_option('-U', '--update',
            action='store_true', dest='update_self', help='update this program to latest version')
    general.add_option('-i', '--ignore-errors',
            action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
    general.add_option('-r', '--rate-limit',
            dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
    general.add_option('-R', '--retries',
            dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
    general.add_option('--playlist-start',
            dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
    general.add_option('--playlist-end',
            dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
    general.add_option('--dump-user-agent',
            action='store_true', dest='dump_user_agent',
            help='display the current browser identification', default=False)

    authentication.add_option('-u', '--username',
            dest='username', metavar='USERNAME', help='account username')
    authentication.add_option('-p', '--password',
            dest='password', metavar='PASSWORD', help='account password')
    authentication.add_option('-n', '--netrc',
            action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


    video_format.add_option('-f', '--format',
            action='store', dest='format', metavar='FORMAT', help='video format code')
    video_format.add_option('--all-formats',
            action='store_const', dest='format', help='download all available video formats', const='-1')
    video_format.add_option('--max-quality',
            action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')


    verbosity.add_option('-q', '--quiet',
            action='store_true', dest='quiet', help='activates quiet mode', default=False)
    verbosity.add_option('-s', '--simulate',
            action='store_true', dest='simulate', help='do not download video', default=False)
    verbosity.add_option('-g', '--get-url',
            action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
    verbosity.add_option('-e', '--get-title',
            action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
    verbosity.add_option('--get-thumbnail',
            action='store_true', dest='getthumbnail',
            help='simulate, quiet but print thumbnail URL', default=False)
    verbosity.add_option('--get-description',
            action='store_true', dest='getdescription',
            help='simulate, quiet but print video description', default=False)
    verbosity.add_option('--get-filename',
            action='store_true', dest='getfilename',
            help='simulate, quiet but print output filename', default=False)
    verbosity.add_option('--no-progress',
            action='store_true', dest='noprogress', help='do not print progress bar', default=False)
    verbosity.add_option('--console-title',
            action='store_true', dest='consoletitle',
            help='display progress in console titlebar', default=False)


    filesystem.add_option('-t', '--title',
            action='store_true', dest='usetitle', help='use title in file name', default=False)
    filesystem.add_option('-l', '--literal',
            action='store_true', dest='useliteral', help='use literal title in file name', default=False)
    filesystem.add_option('-A', '--auto-number',
            action='store_true', dest='autonumber',
            help='number downloaded files starting from 00000', default=False)
    filesystem.add_option('-o', '--output',
            dest='outtmpl', metavar='TEMPLATE', help='output filename template')
    filesystem.add_option('-a', '--batch-file',
            dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
    filesystem.add_option('-w', '--no-overwrites',
            action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
    filesystem.add_option('-c', '--continue',
            action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
    filesystem.add_option('--cookies',
            dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
    filesystem.add_option('--no-part',
            action='store_true', dest='nopart', help='do not use .part files', default=False)
    filesystem.add_option('--no-mtime',
            action='store_false', dest='updatetime',
            help='do not use the Last-modified header to set the file modification time', default=True)
    filesystem.add_option('--write-description',
            action='store_true', dest='writedescription',
            help='write video description to a .description file', default=False)
    filesystem.add_option('--write-info-json',
            action='store_true', dest='writeinfojson',
            help='write video metadata to a .info.json file', default=False)


    postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
            help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
    postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
            help='"best", "aac" or "mp3"; best by default')


    parser.add_option_group(general)
    parser.add_option_group(filesystem)
    parser.add_option_group(verbosity)
    parser.add_option_group(video_format)
    parser.add_option_group(authentication)
    parser.add_option_group(postproc)

    opts, args = parser.parse_args()

    return parser, opts, args
3267
3268 def main():
3269 parser, opts, args = parseOpts()
3270
3271 # Open appropriate CookieJar
3272 if opts.cookiefile is None:
3273 jar = cookielib.CookieJar()
3274 else:
3275 try:
3276 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3277 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3278 jar.load()
3279 except (IOError, OSError), err:
3280 sys.exit(u'ERROR: unable to open cookie file')
3281
3282 # Dump user agent
3283 if opts.dump_user_agent:
3284 print std_headers['User-Agent']
3285 sys.exit(0)
3286
3287 # General configuration
3288 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3289 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
3290 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3291
3292 # Batch file verification
3293 batchurls = []
3294 if opts.batchfile is not None:
3295 try:
3296 if opts.batchfile == '-':
3297 batchfd = sys.stdin
3298 else:
3299 batchfd = open(opts.batchfile, 'r')
3300 batchurls = batchfd.readlines()
3301 batchurls = [x.strip() for x in batchurls]
3302 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3303 except IOError:
3304 sys.exit(u'ERROR: batch file could not be read')
3305 all_urls = batchurls + args
3306
3307 # Conflicting, missing and erroneous options
3308 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3309 parser.error(u'using .netrc conflicts with giving username/password')
3310 if opts.password is not None and opts.username is None:
3311 parser.error(u'account username missing')
3312 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3313 parser.error(u'using output template conflicts with using title, literal title or auto number')
3314 if opts.usetitle and opts.useliteral:
3315 parser.error(u'using title conflicts with using literal title')
3316 if opts.username is not None and opts.password is None:
3317 opts.password = getpass.getpass(u'Type account password and press return:')
3318 if opts.ratelimit is not None:
3319 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3320 if numeric_limit is None:
3321 parser.error(u'invalid rate limit specified')
3322 opts.ratelimit = numeric_limit
3323 if opts.retries is not None:
3324 try:
3325 opts.retries = long(opts.retries)
3326 except (TypeError, ValueError), err:
3327 parser.error(u'invalid retry count specified')
3328 try:
3329 opts.playliststart = int(opts.playliststart)
3330 if opts.playliststart <= 0:
3331 raise ValueError(u'Playlist start must be positive')
3332 except (TypeError, ValueError), err:
3333 parser.error(u'invalid playlist start number specified')
3334 try:
3335 opts.playlistend = int(opts.playlistend)
3336 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3337 raise ValueError(u'Playlist end must be greater than playlist start')
3338 except (TypeError, ValueError), err:
3339 parser.error(u'invalid playlist end number specified')
3340 if opts.extractaudio:
3341 if opts.audioformat not in ['best', 'aac', 'mp3']:
3342 parser.error(u'invalid audio format specified')
3343
3344 # Information extractors
3345 youtube_ie = YoutubeIE()
3346 metacafe_ie = MetacafeIE(youtube_ie)
3347 dailymotion_ie = DailymotionIE()
3348 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
3349 youtube_user_ie = YoutubeUserIE(youtube_ie)
3350 youtube_search_ie = YoutubeSearchIE(youtube_ie)
3351 google_ie = GoogleIE()
3352 google_search_ie = GoogleSearchIE(google_ie)
3353 photobucket_ie = PhotobucketIE()
3354 yahoo_ie = YahooIE()
3355 yahoo_search_ie = YahooSearchIE(yahoo_ie)
3356 deposit_files_ie = DepositFilesIE()
3357 facebook_ie = FacebookIE()
3358 bliptv_ie = BlipTVIE()
3359 vimeo_ie = VimeoIE()
3360 generic_ie = GenericIE()
3361
3362 # File downloader
3363 fd = FileDownloader({
3364 'usenetrc': opts.usenetrc,
3365 'username': opts.username,
3366 'password': opts.password,
3367 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3368 'forceurl': opts.geturl,
3369 'forcetitle': opts.gettitle,
3370 'forcethumbnail': opts.getthumbnail,
3371 'forcedescription': opts.getdescription,
3372 'forcefilename': opts.getfilename,
3373 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3374 'format': opts.format,
3375 'format_limit': opts.format_limit,
3376 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3377 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3378 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3379 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3380 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3381 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3382 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3383 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3384 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3385 or u'%(id)s.%(ext)s'),
3386 'ignoreerrors': opts.ignoreerrors,
3387 'ratelimit': opts.ratelimit,
3388 'nooverwrites': opts.nooverwrites,
3389 'retries': opts.retries,
3390 'continuedl': opts.continue_dl,
3391 'noprogress': opts.noprogress,
3392 'playliststart': opts.playliststart,
3393 'playlistend': opts.playlistend,
3394 'logtostderr': opts.outtmpl == '-',
3395 'consoletitle': opts.consoletitle,
3396 'nopart': opts.nopart,
3397 'updatetime': opts.updatetime,
3398 'writedescription': opts.writedescription,
3399 'writeinfojson': opts.writeinfojson,
3400 })
3401 fd.add_info_extractor(youtube_search_ie)
3402 fd.add_info_extractor(youtube_pl_ie)
3403 fd.add_info_extractor(youtube_user_ie)
3404 fd.add_info_extractor(metacafe_ie)
3405 fd.add_info_extractor(dailymotion_ie)
3406 fd.add_info_extractor(youtube_ie)
3407 fd.add_info_extractor(google_ie)
3408 fd.add_info_extractor(google_search_ie)
3409 fd.add_info_extractor(photobucket_ie)
3410 fd.add_info_extractor(yahoo_ie)
3411 fd.add_info_extractor(yahoo_search_ie)
3412 fd.add_info_extractor(deposit_files_ie)
3413 fd.add_info_extractor(facebook_ie)
3414 fd.add_info_extractor(bliptv_ie)
3415 fd.add_info_extractor(vimeo_ie)
3416
3417 # This must come last since it's the
3418 # fallback if none of the others work
3419 fd.add_info_extractor(generic_ie)
3420
3421 # PostProcessors
3422 if opts.extractaudio:
3423 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3424
3425 # Update version
3426 if opts.update_self:
3427 updateSelf(fd, sys.argv[0])
3428
3429 # Maybe do nothing
3430 if len(all_urls) < 1:
3431 if not opts.update_self:
3432 parser.error(u'you must provide at least one URL')
3433 else:
3434 sys.exit()
3435 retcode = fd.download(all_urls)
3436
3437 # Dump cookie jar if requested
3438 if opts.cookiefile is not None:
3439 try:
3440 jar.save()
3441 except (IOError, OSError), err:
3442 sys.exit(u'ERROR: unable to save cookie jar')
3443
3444 sys.exit(retcode)
3445
3446
if __name__ == '__main__':
    # Translate the expected top-level exceptions into clean exit codes
    # instead of tracebacks.
    try:
        main()
    except DownloadError:
        # Errors were already reported by the downloader; just signal failure.
        sys.exit(1)
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')
3456
3457 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: